def text_net(word_result, word_weight, weibo):  # extract representative weibo via the word network
    # deduplicate texts
    text_list = []
    for i in range(0, len(weibo)):
        row = dict()
        row['_id'] = i
        row['title'] = ''
        row['content'] = weibo[i][0].decode('utf-8')
        text_list.append(row)
    results = duplicate(text_list)
    new_weibo = []
    for item in results:
        if not item['duplicate']:
            index = item['_id']
            new_weibo.append(weibo[index])
    max_n = int(len(new_weibo) * 0.5) + 1
    min_n = len(new_weibo) / len(word_result.keys()) + 1
    # extract the representative texts of each cluster
    text_total = dict()
    opinion_name = dict()
    start = time.time()
    title = dict()
    dur_time = dict()
    for k, v in word_result.iteritems():
        text_list = get_text_net(v, new_weibo, word_weight, max_n, min_n)
        name = get_title(v, word_weight)  # extract the sub-topic name
        opinion_name[k] = '&'.join(name)
        text_total[k] = text_list
    return text_total, opinion_name
def text_net(word_result, word_weight, weibo):  # extract representative weibo via the word network
    # deduplicate texts
    text_list = []
    for i in range(0, len(weibo)):
        row = dict()
        row['_id'] = i
        row['title'] = ''
        row['content'] = weibo[i].decode('utf-8')
        text_list.append(row)
    results = duplicate(text_list)
    new_weibo = []
    for item in results:
        if not item['duplicate']:
            index = item['_id']
            new_weibo.append(weibo[index])
    # extract the representative texts of each cluster
    text_total = []
    for k, v in word_result.iteritems():
        text_list = get_text_net(v, new_weibo, word_weight)
        row_list = []
        for text in text_list:
            if text not in row_list:
                row_list.append(text)
        text_total.append(row_list)
    return text_total
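Most snippets in this file share one near-duplicate detection contract: duplicate() takes a list of dicts carrying '_id', 'title' and 'content', and hands the items back annotated with a boolean 'duplicate' flag and a 'same_from' field naming the '_id' of the earlier text a flagged item repeats (see the consumers above, and in step3_cal and social_sensing below). Not every duplicate in this file is that helper: the gpvdm snippet instantiates a QWidget subclass of the same name, and two unit tests below target unrelated functions. What follows is a minimal sketch of the dedup contract only; the pairwise SequenceMatcher similarity and the 0.8 threshold are illustrative assumptions, not the production implementation.

from difflib import SequenceMatcher

def duplicate(items, threshold=0.8):
    # Sketch of the dedup contract assumed across this file: annotate every
    # item with 'duplicate', and point flagged items at the '_id' of the
    # earlier item they repeat via 'same_from'. The similarity measure and
    # threshold are assumptions for illustration.
    kept = []  # items accepted as originals so far
    for item in items:
        item['duplicate'] = False
        item['same_from'] = item['_id']
        for earlier in kept:
            sim = SequenceMatcher(None, item['content'], earlier['content']).ratio()
            if sim >= threshold:
                item['duplicate'] = True
                item['same_from'] = earlier['_id']
                break
        if not item['duplicate']:
            kept.append(item)
    return items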
def get_commentContent(entity_name, score, index_name, type):
    query_body = {
        "size": 200,
        "query": {
            "bool": {
                "must": {"match": {"content": entity_name}},
                "should": [
                    {"match": {"em1": 0}},
                    {"match": {"em1": 1}}
                ],
                "minimum_should_match": 1
            }
        }
    }
    res = es.search(index=index_name, doc_type=type, body=query_body, request_timeout=100)
    hits = res['hits']['hits']
    results = []
    if len(hits):
        for item in hits:
            name = item['_index']
            if item['_score'] >= score:
                if entity_name in item['_source']['content']:
                    result = item['_source']
                    result.update({'source': name})
                    results.append(result)
    # sort by publish time
    results.sort(key=lambda x: x['publish_time'], reverse=True)
    # deduplicate by text similarity
    dup_results = duplicate(results)
    return dup_results
def step3_cal():
    """Compute each cluster's feature words and representative texts,
       deduplicate, and update cluster size and growth info.
    """
    print '[%s] ' % ts2datetime(int(time.time())), 'event ', eventid, ' %s start step3' % ts2datetime(timestamp)
    inputs = []
    subevents = event.getSubEvents()
    for subevent in subevents:
        subeventid = subevent["_id"]
        inputs.extend(event.getSubeventInfos(subeventid))
    for r in inputs:
        r["title"] = r["title"].encode("utf-8")
        r["content"] = r["content168"].encode("utf-8")
        r["label"] = r["subeventid"]
    # compute the accumulated feature words of each cluster
    cluster_feature = extract_feature(inputs)
    for label, fwords in cluster_feature.iteritems():
        feature = Feature(label)
        feature.upsert_newest(fwords)
    # compute text weights
    for r in inputs:
        weight = text_weight_cal(r, cluster_feature[r['label']])
        news = News(r["_id"], event.id)
        news.update_news_weight(weight)
    # text deduplication
    items_dict = {}
    for r in inputs:
        try:
            items_dict[r["label"]].append(r)
        except KeyError:
            items_dict[r["label"]] = [r]
    for label, items in items_dict.iteritems():
        results = duplicate(items)
        for r in results:
            news = News(r["_id"], event.id)
            news.update_news_duplicate(r["duplicate"], r["same_from"])
        # update cluster size and growth info
        before_size = event.get_subevent_size(label)
        event.update_subevent_size(label, len(items))
        event.update_subevent_addsize(label, len(items) - before_size)
    if initializing:
        # switch the event status from initializing to active
        event.activate()
    print '[%s] ' % ts2datetime(int(time.time())), 'event ', eventid, ' %s end step3' % ts2datetime(timestamp)
def all_source_match(news_id, keywords_list):
    for source, doc_type in TYPE1_DICT.iteritems():
        result = match_topic_kw(news_id, keywords_list, source, doc_type)
        print len(result), 'fintext in', source
        # mark unique texts
        unique_result = duplicate(result)
        print len(unique_result), 'unique fintext in', source
        save_topic_es(result, unique_result, index=TOPIC_ABOUT_INDEX, doc_type=source)
        print 'insert complete in', source
def __init__(self, name):
    QWidgetSavePos.__init__(self, name)
    self.setMinimumSize(900, 600)
    self.setWindowIcon(QIcon_load("preferences-system"))
    self.setWindowTitle(_("Fit configure") + " (https://www.gpvdm.com)")

    self.main_vbox = QVBoxLayout()

    toolbar = QToolBar()
    toolbar.setIconSize(QSize(48, 48))

    spacer = QWidget()
    spacer.setSizePolicy(QSizePolicy.Expanding, QSizePolicy.Expanding)
    toolbar.addWidget(spacer)

    self.undo = QAction(QIcon_load("help"), _("Help"), self)
    self.undo.setStatusTip(_("Close"))
    self.undo.triggered.connect(self.callback_help)
    toolbar.addAction(self.undo)

    self.main_vbox.addWidget(toolbar)

    self.notebook = QTabWidget()
    self.notebook.setMovable(True)
    self.main_vbox.addWidget(self.notebook)

    files = [os.path.join(get_sim_path(), "fit.inp")]
    description = [_("Configure minimizer")]
    for i in range(0, len(files)):
        tab = tab_class()
        tab.init(files[i], description[i])
        self.notebook.addTab(tab, description[i])

    self.duplicate_window = duplicate()
    self.notebook.addTab(self.duplicate_window, _("Duplicate window"))

    self.fit_vars_window = fit_vars()
    self.notebook.addTab(self.fit_vars_window, _("Fit variable window"))

    self.constraints_window = constraints()
    self.notebook.addTab(self.constraints_window, _("Fit constraints"))

    self.setLayout(self.main_vbox)
def test_duplicate(self):
    rules = {
        'A': ['B'],  # 1
        'B': ['C'],  # 2
        'C': ['D'],  # 3
    }
    content = [
        'A\t\tA B C D E',
        'B\t\tB C A B D E'
    ]
    expected = sorted([
        # O1
        ('A', 'A B C D E'),
        # O1 -> 1
        ('A', 'B B C D E'),
        # O1 -> 2
        ('A', 'A C C D E'),
        # O1 -> 3
        ('A', 'A B D D E'),
        # O1 -> 1 + 2
        ('A', 'B C C D E'),
        # O1 -> 1 + 3
        ('A', 'B B D D E'),
        # O1 -> 1 + 2 + 3
        ('A', 'B C D D E'),
        # O1 -> 2 + 3
        ('A', 'A C D D E'),
        # O2
        ('B', 'B C A B D E'),
        # O2 -> 1
        ('B', 'B C B B D E'),
        # O2 -> 2
        ('B', 'C C A C D E'),
        # O2 -> 3
        ('B', 'B D A B D E'),
        # O2 -> 1 + 2
        ('B', 'C C B C D E'),
        # O2 -> 1 + 3
        ('B', 'B D B B D E'),
        # O2 -> 2 + 3
        ('B', 'C D A C D E'),
        # O2 -> 1 + 2 + 3
        ('B', 'C D B C D E')
    ])
    duplicated = sorted(duplicate.duplicate(content, rules))
    self.assertEqual(expected, duplicated)
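The test above pins down the semantics of this (unrelated) duplicate module: each content line is 'NAME\t\tTEXT', and the expected output holds the original pair plus one variant per non-empty subset of the rules, with every substitution in the subset applied simultaneously across the tokens. A minimal sketch consistent with those expectations follows; the module name duplicate (hence the duplicate.duplicate(...) call), single-replacement rules, and space-tokenized text are read off the test, and anything beyond that is an assumption.

from itertools import combinations

def duplicate(content, rules):
    # Sketch of duplicate.duplicate(): for every 'NAME\t\tTEXT' line, emit
    # the original (name, text) pair plus one variant per non-empty subset
    # of rules, applying all substitutions in the subset simultaneously.
    # Assumes single-replacement rules, as in the test above.
    rule_items = [(old, repls[0]) for old, repls in sorted(rules.items())]
    results = []
    for line in content:
        name, text = line.split('\t\t')
        tokens = text.split(' ')
        results.append((name, text))
        for size in range(1, len(rule_items) + 1):
            for combo in combinations(rule_items, size):
                mapping = dict(combo)
                results.append((name, ' '.join(mapping.get(t, t) for t in tokens)))
    return results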
def domain_rec(sort_field='hot'):
    module_keywords = [("baidu_site_search", "domain_keywords_20150617.txt")]
    keywords_file_list = list(set([v for k, v in module_keywords]))
    query_dict = {
        "timestamp": {
            "$gte": START_TS,
            "$lt": END_TS
        },
        "keywords_hit": True,
        "rubbish": False,
        "category": {"$in": keywords_file_list},
    }
    count = mongo.boatcol.find(query_dict).count()
    print "domain news candidate %s count: " % sort_field, count
    results = mongo.boatcol.find(query_dict).sort(sort_field, pymongo.DESCENDING)
    results = [r for r in results]
    for r in results:
        r['title'] = r['title'].encode('utf-8')
        if r['summary']:
            r['content'] = r['summary'].encode('utf-8')
        else:
            r['content'] = ''
    results = duplicate(results)
    fw = csv.writer(open(result_path + 'domain_news_sort_%s_%s_%s.csv' % (sort_field, START_DATETIME, END_DATETIME), 'wb'), delimiter='^')
    fw.writerow(BAIDU_SITE_KEYS)
    for r in results:
        csvrow = []
        if r['duplicate'] == False:
            for key in BAIDU_SITE_KEYS:
                try:
                    if key == "hit_set":
                        item = _encode_utf8(','.join(r[key]))
                    else:
                        item = _encode_utf8(r[key])
                    csvrow.append(item)
                except KeyError:
                    csvrow.append('')
            fw.writerow(csvrow)
def get_other_comment(entity_name, index_name, type, date, comment_date):
    t = int(time.mktime(time.strptime(comment_date, '%Y-%m-%d')))
    start_time1 = datetime.strptime(comment_date, "%Y-%m-%d") - timedelta(days=int(date))
    start_time2 = start_time1.strftime("%Y-%m-%d")
    start_time = int(time.mktime(time.strptime(start_time2, "%Y-%m-%d")))
    query_body = {
        "size": 500,
        "query": {
            "bool": {
                "must": {
                    "match": {
                        "query_name": entity_name
                    }
                }
            }
        }
    }
    if comment_date != start_time2:
        query_body['query']['bool'].update({
            "filter": [{
                "range": {
                    "publish_time": {
                        "gt": start_time,
                        "lte": t
                    }
                }
            }]
        })
    res = es.search(index=index_name, doc_type=type, body=query_body, request_timeout=100)
    hits = res['hits']['hits']
    results = []
    if len(hits):
        for item in hits:
            index = item['_index']
            if entity_name in item['_source']['query_name']:
                if not "ad01" in item['_source'] and not "em0" in item['_source'] and not "em1" in item['_source']:
                    item['_source'].update({'source': index})
                    results.append(item['_source'])
    dup_results = duplicate(results)
    return dup_results
def domain_rec(sort_field='hot'):
    module_keywords = [("sogou_weixin_search", "keywords_domain_weixin.txt"),
                       ("tianya_bbs_search", "keywords_domain_forum.txt"),
                       ("xinhua_bbs_search", "keywords_domain_forum.txt"),
                       ("baidu_ns_search", "keywords_domain_baidu.txt")]
    keywords_file_list = list(set([v for k, v in module_keywords]))
    query_dict = {
        "timestamp": {
            "$gte": START_TS,
            "$lt": END_TS
        },
        "keywords_hit": True,
        "rubbish": False,
        "category": {"$in": keywords_file_list},
    }
    count = mongo.boatcol.find(query_dict).count()
    print "domain news candidate %s count: " % sort_field, count
    results = mongo.boatcol.find(query_dict).sort(sort_field, pymongo.DESCENDING)
    results = [r for r in results]
    for r in results:
        r['title'] = r['title'].encode('utf-8')
        r['content'] = r['summary'].encode('utf-8')
    results = duplicate(results)
    fw = csv.writer(open(result_path + 'domain_news_sort_%s_%s_%s.csv' % (sort_field, START_DATETIME, END_DATETIME), 'wb'), delimiter='^')
    fw.writerow(NEWS_KEYS)
    for r in results:
        csvrow = []
        if r['duplicate'] == False:
            for key in NEWS_KEYS:
                csvrow.append(_encode_utf8(r[key]))
            fw.writerow(csvrow)
def social_sensing(task_detail):
    # task name, sensors, stop time, previous status, creator, timestamp
    task_name = task_detail[0]
    social_sensors = task_detail[1]
    stop_time = task_detail[2]
    forward_warning_status = task_detail[3]
    create_by = task_detail[4]
    ts = int(task_detail[5])
    new = int(task_detail[6])
    print ts2date(ts)

    # PART 1
    forward_result = get_forward_numerical_info(task_name, ts, create_by)
    # original/retweeted weibo mid lists from the previous time window
    forward_origin_weibo_list = query_mid_list(ts - time_interval, social_sensors, forward_time_range)
    forward_retweeted_weibo_list = query_mid_list(ts - time_interval, social_sensors, forward_time_range, 3)
    # original weibo mid list in the current window
    current_mid_list = query_mid_list(ts, social_sensors, time_interval)
    current_retweeted_mid_list = query_mid_list(ts, social_sensors, time_interval, 3)
    all_mid_list = []
    all_mid_list.extend(current_mid_list)
    all_mid_list.extend(current_retweeted_mid_list)
    all_mid_list.extend(forward_origin_weibo_list)
    all_mid_list.extend(forward_retweeted_weibo_list)
    all_origin_list = []
    all_origin_list.extend(current_mid_list)
    all_origin_list.extend(forward_origin_weibo_list)
    all_retweeted_list = []
    all_retweeted_list.extend(current_retweeted_mid_list)
    all_retweeted_list.extend(forward_retweeted_weibo_list)  # mid/root-mid of retweeted weibos

    print "all mid list: ", len(all_mid_list)
    # print "all_origin_list", all_origin_list
    # print "all_retweeted_list", all_retweeted_list

    # query retweet/comment counts in the current window, aggregated by message_type
    statistics_count = query_related_weibo(ts, all_mid_list, time_interval)
    if all_origin_list:
        origin_weibo_detail = query_hot_weibo(ts, all_origin_list, time_interval)  # details of original weibos
    else:
        origin_weibo_detail = {}
    if all_retweeted_list:
        retweeted_weibo_detail = query_hot_weibo(ts, all_retweeted_list, time_interval)  # details of retweeted weibos
    else:
        retweeted_weibo_detail = {}
    current_total_count = statistics_count["total_count"]  # total number of weibos in the current window
    current_retweeted_count = statistics_count["retweeted"]
    current_comment_count = statistics_count["comment"]

    # PART 2
    # aggregate the positive/neutral/sad/angry sentiment distribution in the current window
    # sentiment_dict = {"0": "neutral", "1": "positive", "2": "sad", "3": "anger"}
    sentiment_count = {"0": 0, "1": 0, "2": 0, "3": 0}
    search_results = aggregation_sentiment_related_weibo(ts, all_mid_list, time_interval)
    sentiment_count = search_results
    print "sentiment_count: ", sentiment_count
    negetive_key = ["2", "3", "4", "5", "6"]
    negetive_count = 0
    for key in negetive_key:
        negetive_count += sentiment_count[key]

    # aggregate important users in the current window
    important_uid_list = []
    datetime = ts2datetime(ts - time_interval)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    if exist_es:
        search_results = get_important_user(ts, all_mid_list, time_interval)
        important_uid_list = search_results
    # match important users from the portrait library using the uid list
    if important_uid_list:
        important_results = es_user_portrait.mget(
            index=portrait_index_name,
            doc_type=portrait_index_type,
            body={"ids": important_uid_list}
        )["docs"]
    else:
        important_results = []
    filter_important_list = []  # uid_list
    if important_results:
        for item in important_results:
            if item["found"]:
                # if item['_source']['importance'] > IMPORTANT_USER_THRESHOULD:
                filter_important_list.append(item["_id"])

    # sensing decision
    burst_reason = signal_nothing_variation
    warning_status = signal_nothing
    finish = unfinish_signal  # "0"
    process_status = "1"

    if forward_result[0]:
        # use the moving average to decide whether an event occurred
        mean_count = forward_result[1]
        std_count = forward_result[2]
        mean_sentiment = forward_result[3]
        std_sentiment = forward_result[4]
        if (mean_count >= MEAN_COUNT and current_total_count > mean_count + 1.96 * std_count
                or current_total_count >= len(all_mid_list) * AVERAGE_COUNT):
            # anomaly detected
            if forward_warning_status == signal_brust:
                # an event already exists: switch to event tracking
                warning_status = signal_track
            else:
                warning_status = signal_brust
            burst_reason += signal_count_varition  # count anomaly
        if (negetive_count > mean_sentiment + 1.96 * std_sentiment and mean_sentiment >= MEAN_COUNT
                or negetive_count >= len(all_mid_list) * AVERAGE_COUNT):
            warning_status = signal_brust
            burst_reason += signal_sentiment_varition  # negative-sentiment anomaly; "12" means both are anomalous
            if forward_warning_status == signal_brust:
                # an event already exists: switch to event tracking
                warning_status = signal_track

    if int(stop_time) <= ts:  # check whether the task has finished
        finish = finish_signal
        process_status = "0"

    # sensed items: all_mid_list
    tmp_burst_reason = burst_reason
    topic_list = []
    sensitive_text_list = []

    # start when an event occurs
    # if warning_status:
    if 1:
        index_list = []
        important_words = []
        datetime_1 = ts2datetime(ts)
        index_name_1 = flow_text_index_name_pre + datetime_1
        exist_es = es_text.indices.exists(index=index_name_1)
        if exist_es:
            index_list.append(index_name_1)
        datetime_2 = ts2datetime(ts - DAY)
        index_name_2 = flow_text_index_name_pre + datetime_2
        exist_es = es_text.indices.exists(index=index_name_2)
        if exist_es:
            index_list.append(index_name_2)
        if index_list and all_mid_list:
            query_body = {"query": {"filtered": {"filter": {"terms": {"mid": all_mid_list}}}}, "size": 5000}
            search_results = es_text.search(index=index_list, doc_type="text", body=query_body)["hits"]["hits"]
            tmp_sensitive_warning = ""
            text_dict = dict()  # text info
            mid_value = dict()  # per-text value
            duplicate_dict = dict()  # duplicate map
            portrait_dict = dict()  # background info
            classify_text_dict = dict()  # texts for classification
            classify_uid_list = []
            duplicate_text_list = []
            sensitive_words_dict = dict()
            if search_results:
                for item in search_results:
                    iter_uid = item["_source"]["uid"]
                    iter_mid = item["_source"]["mid"]
                    iter_text = item["_source"]["text"].encode("utf-8", "ignore")
                    iter_sensitive = item["_source"].get("sensitive", 0)
                    duplicate_text_list.append({"_id": iter_mid, "title": "", "content": iter_text})
                    if iter_sensitive:
                        tmp_sensitive_warning = signal_sensitive_variation  # weibos containing sensitive words
                        sensitive_words_dict[iter_mid] = iter_sensitive
                    keywords_dict = json.loads(item["_source"]["keywords_dict"])
                    personal_keywords_dict = dict()
                    for k, v in keywords_dict.iteritems():
                        k = k.encode("utf-8", "ignore")
                        personal_keywords_dict[k] = v
                    classify_text_dict[iter_mid] = personal_keywords_dict
                    classify_uid_list.append(iter_uid)
            # deduplicate
            if duplicate_text_list:
                dup_results = duplicate(duplicate_text_list)
                for item in dup_results:
                    if item["duplicate"]:
                        duplicate_dict[item["_id"]] = item["same_from"]
            # classify
            if classify_text_dict:
                classify_results = topic_classfiy(classify_uid_list, classify_text_dict)
                mid_value = dict()
                # print "classify_results: ", classify_results
                for k, v in classify_results.iteritems():  # mid:value
                    mid_value[k] = topic_value_dict[v[0]]
            if tmp_sensitive_warning:
                warning_status = signal_brust
                burst_reason += signal_sensitive_variation
            sensitive_weibo_detail = {}
            if sensitive_words_dict:
                sensitive_mid_list = sensitive_words_dict.keys()
                sensitive_weibo_detail = query_hot_weibo(ts, sensitive_mid_list, time_interval)

    results = dict()
    results["mid_topic_value"] = json.dumps(mid_value)
    results["duplicate_dict"] = json.dumps(duplicate_dict)
    results["sensitive_words_dict"] = json.dumps(sensitive_words_dict)
    results["sensitive_weibo_detail"] = json.dumps(sensitive_weibo_detail)
    results["origin_weibo_number"] = len(all_origin_list)
    results["retweeted_weibo_number"] = len(all_retweeted_list)
    results["origin_weibo_detail"] = json.dumps(origin_weibo_detail)
    results["retweeted_weibo_detail"] = json.dumps(retweeted_weibo_detail)
    results["retweeted_weibo_count"] = current_retweeted_count
    results["comment_weibo_count"] = current_comment_count
    results["weibo_total_number"] = current_total_count
    results["sentiment_distribution"] = json.dumps(sentiment_count)
    results["important_users"] = json.dumps(filter_important_list)
    results["unfilter_users"] = json.dumps(important_uid_list)
    results["burst_reason"] = tmp_burst_reason
    results["timestamp"] = ts
    # results['clustering_topic'] = json.dumps(topic_list)
    # store the current-window info in ES
    doctype = create_by + "-" + task_name
    es_user_portrait.index(index=index_sensing_task, doc_type=doctype, id=ts, body=results)

    # update the manage-social-sensing record in ES
    if not new:
        temporal_result = es_user_portrait.get(index=index_manage_social_task, doc_type=task_doc_type, id=doctype)["_source"]
        temporal_result["warning_status"] = warning_status
        temporal_result["burst_reason"] = tmp_burst_reason
        temporal_result["finish"] = finish
        temporal_result["processing_status"] = process_status
        history_status = json.loads(temporal_result["history_status"])
        history_status.append([ts, task_name, warning_status])
        temporal_result["history_status"] = json.dumps(history_status)
        es_user_portrait.index(index=index_manage_social_task, doc_type=task_doc_type, id=doctype, body=temporal_result)
    else:
        print "test"
    return "1"
def social_sensing(task_detail):
    # task name, sensors, stop time, previous status, creator, timestamp
    task_name = task_detail[0]
    social_sensors = task_detail[1]
    stop_time = task_detail[2]
    create_by = task_detail[3]
    ts = int(task_detail[4])
    print ts2date(ts)

    # PART 1
    # forward_result = get_forward_numerical_info(task_name, ts, create_by)
    # original/retweeted weibo mid lists from the previous time window
    forward_origin_weibo_list = query_mid_list(ts - time_interval, social_sensors, forward_time_range)
    forward_retweeted_weibo_list = query_mid_list(ts - time_interval, social_sensors, forward_time_range, 3)
    # original weibo mid list in the current window
    current_mid_list = query_mid_list(ts, social_sensors, time_interval)
    current_retweeted_mid_list = query_mid_list(ts, social_sensors, time_interval, 3)
    all_mid_list = []
    all_mid_list.extend(current_mid_list)
    all_mid_list.extend(current_retweeted_mid_list)
    all_mid_list.extend(forward_origin_weibo_list)
    all_mid_list.extend(forward_retweeted_weibo_list)
    all_origin_list = []
    all_origin_list.extend(current_mid_list)
    all_origin_list.extend(forward_origin_weibo_list)
    all_retweeted_list = []
    all_retweeted_list.extend(current_retweeted_mid_list)
    all_retweeted_list.extend(forward_retweeted_weibo_list)  # mid/root-mid of retweeted weibos

    print "all mid list: ", len(all_mid_list)
    # print "all_origin_list", all_origin_list
    # print "all_retweeted_list", all_retweeted_list

    # query retweet/comment counts in the current window, aggregated by message_type
    statistics_count = query_related_weibo(ts, all_mid_list, time_interval)
    if all_origin_list:
        origin_weibo_detail = query_hot_weibo(ts, all_origin_list, time_interval)  # details of original weibos
    else:
        origin_weibo_detail = {}
    if all_retweeted_list:
        retweeted_weibo_detail = query_hot_weibo(ts, all_retweeted_list, time_interval)  # details of retweeted weibos
    else:
        retweeted_weibo_detail = {}
    current_total_count = statistics_count['total_count']  # total number of weibos in the current window
    current_retweeted_count = statistics_count['retweeted']
    current_comment_count = statistics_count['comment']

    # aggregate important users in the current window
    important_uid_list = []
    datetime = ts2datetime(ts - time_interval)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    if exist_es:
        search_results = get_important_user(ts, all_mid_list, time_interval)
        important_uid_list = search_results
    # match important users from the portrait library using the uid list
    if important_uid_list:
        important_results = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type, body={"ids": important_uid_list})['docs']
    else:
        important_results = []
    filter_important_list = []  # uid_list
    if important_results:
        for item in important_results:
            if item['found']:
                # if item['_source']['importance'] > IMPORTANT_USER_THRESHOULD:
                filter_important_list.append(item['_id'])
    print "filter_important_list", filter_important_list
    print "important_results", important_uid_list

    # sensing decision
    finish = unfinish_signal  # "0"
    process_status = "1"
    if int(stop_time) <= ts:  # check whether the task has finished
        finish = finish_signal
        process_status = "0"

    # sensed items: all_mid_list
    sensitive_text_list = []

    # start when an event occurs
    if 1:
        index_list = []
        important_words = []
        datetime_1 = ts2datetime(ts)
        index_name_1 = flow_text_index_name_pre + datetime_1
        exist_es = es_text.indices.exists(index=index_name_1)
        if exist_es:
            index_list.append(index_name_1)
        datetime_2 = ts2datetime(ts - DAY)
        index_name_2 = flow_text_index_name_pre + datetime_2
        exist_es = es_text.indices.exists(index=index_name_2)
        if exist_es:
            index_list.append(index_name_2)
        if index_list and all_mid_list:
            query_body = {
                "query": {
                    "filtered": {
                        "filter": {
                            "terms": {"mid": all_mid_list}
                        }
                    }
                },
                "size": 5000
            }
            search_results = es_text.search(index=index_list, doc_type="text", body=query_body)['hits']['hits']
            tmp_sensitive_warning = ""
            text_dict = dict()  # text info
            mid_value = dict()  # per-text value
            duplicate_dict = dict()  # duplicate map
            portrait_dict = dict()  # background info
            classify_text_dict = dict()  # texts for classification
            classify_uid_list = []
            duplicate_text_list = []
            sensitive_words_dict = dict()
            if search_results:
                for item in search_results:
                    iter_uid = item['_source']['uid']
                    iter_mid = item['_source']['mid']
                    iter_text = item['_source']['text'].encode('utf-8', 'ignore')
                    iter_sensitive = item['_source'].get('sensitive', 0)
                    duplicate_text_list.append({"_id": iter_mid, "title": "", "content": iter_text})
                    if iter_sensitive:
                        tmp_sensitive_warning = signal_sensitive_variation  # weibos containing sensitive words
                        sensitive_words_dict[iter_mid] = iter_sensitive
                    keywords_dict = json.loads(item['_source']['keywords_dict'])
                    personal_keywords_dict = dict()
                    for k, v in keywords_dict.iteritems():
                        k = k.encode('utf-8', 'ignore')
                        personal_keywords_dict[k] = v
                    classify_text_dict[iter_mid] = personal_keywords_dict
                    classify_uid_list.append(iter_uid)
            # deduplicate
            if duplicate_text_list:
                dup_results = duplicate(duplicate_text_list)
                for item in dup_results:
                    if item['duplicate']:
                        duplicate_dict[item['_id']] = item['same_from']
            # classify
            if classify_text_dict:
                classify_results = topic_classfiy(classify_uid_list, classify_text_dict)
                mid_value = dict()
                # print "classify_results: ", classify_results
                for k, v in classify_results.iteritems():  # mid:value
                    mid_value[k] = topic_value_dict[v[0]]
            sensitive_weibo_detail = {}
            if sensitive_words_dict:
                sensitive_mid_list = sensitive_words_dict.keys()
                sensitive_weibo_detail = query_hot_weibo(ts, sensitive_mid_list, time_interval)

    results = dict()
    results['mid_topic_value'] = json.dumps(mid_value)
    results['duplicate_dict'] = json.dumps(duplicate_dict)
    results['sensitive_words_dict'] = json.dumps(sensitive_words_dict)
    results['sensitive_weibo_detail'] = json.dumps(sensitive_weibo_detail)
    results['origin_weibo_number'] = len(all_origin_list)
    results['retweeted_weibo_number'] = len(all_retweeted_list)
    results['origin_weibo_detail'] = json.dumps(origin_weibo_detail)
    results['retweeted_weibo_detail'] = json.dumps(retweeted_weibo_detail)
    results['retweeted_weibo_count'] = current_retweeted_count
    results['comment_weibo_count'] = current_comment_count
    results['weibo_total_number'] = current_total_count
    results['important_users'] = json.dumps(filter_important_list)
    results['unfilter_users'] = json.dumps(important_uid_list)
    results['timestamp'] = ts
    # results['clustering_topic'] = json.dumps(topic_list)
    # store the current-window info in ES
    doctype = create_by + '-' + task_name
    es_user_portrait.index(index=index_sensing_task, doc_type=doctype, id=ts, body=results)

    # update the manage-social-sensing record in ES
    temporal_result = es_user_portrait.get(index=index_manage_social_task, doc_type=task_doc_type, id=doctype)['_source']
    temporal_result['finish'] = finish
    temporal_result['processing_status'] = process_status
    history_status = json.loads(temporal_result['history_status'])
    history_status.append(ts)
    temporal_result['history_status'] = json.dumps(history_status)
    es_user_portrait.index(index=index_manage_social_task, doc_type=task_doc_type, id=doctype, body=temporal_result)
    return "1"
def comments_rubbish_clustering_calculation(comments, logger,
                                            cluster_num=COMMENT_WORDS_CLUSTER_NUM,
                                            cluster_eva_min_size=CLUSTER_EVA_MIN_SIZE,
                                            version=COMMENT_CLUSTERING_PROCESS_FOR_CLUTO_VERSION):
    """Comment spam filtering and clustering.
       input: comments (each comment carries news_id and news_content)
       cluster_infos: cluster information
       item_infos: per-item list with fields clusterid, weight, same_from, duplicate
    """
    # clusterid for meaningless items: ads from ad_filter, spam from the SVM,
    # news from the subjective/objective classifier
    NON_CLUSTER_ID = 'nonsense'
    # clusterid for the "other" class
    OTHER_CLUSTER_ID = 'other'
    # minimum number of inputs for clustering; below this, skip clustering
    MIN_CLUSTERING_INPUT = 30
    # cluster info, mainly the feature words of each cluster
    clusters_infos = {'features': dict()}

    # strip sentiment/label/clusterid/ad_label/subob_label/rub_label
    clear_keys = ['label', 'ad_label', 'subob_label', 'rub_label', 'weight']
    inputs = []
    for r in comments:
        for key in clear_keys:
            if key in r:
                del r[key]
        inputs.append(r)
    comments = inputs

    # per-item list; each item stores clusterid, weight, sentiment
    items_infos = []

    # preprocess data fields
    inputs = []
    for r in comments:
        r['title'] = ''
        try:
            r['content168'] = r['content168'].encode('utf-8')
        except:
            r['content168'] = r['text'].encode('utf-8')
        r['content'] = r['content168']
        r['text'] = r['content168']
        if 'news_content' in r and r['news_content']:
            r['news_content'] = r['news_content'].encode('utf-8')
        else:
            r['news_content'] = ''
        # filter ads with simple rules
        item = ad_filter(r)
        if item['ad_label'] == 0:
            inputs.append(item)
        else:
            item['clusterid'] = NON_CLUSTER_ID + '_rub'
            items_infos.append(item)

    # remove spam with the SVM
    items = rubbish_classifier(inputs)
    inputs = []
    for item in items:
        if item['rub_label'] == 1:
            item['clusterid'] = NON_CLUSTER_ID + '_rub'
            items_infos.append(item)
        else:
            inputs.append(item)

    # group comments by news
    results = comment_news(inputs)
    final_inputs = []
    for news_id, _inputs in results.iteritems():
        # filter comments against their news
        _inputs = filter_comment(_inputs)
        inputs = [r for r in _inputs if r['rub_label'] == 0]
        inputs_rubbish = [r for r in _inputs if r['rub_label'] == 1]
        for r in inputs_rubbish:
            r['clusterid'] = NON_CLUSTER_ID + '_rub'
            items_infos.append(r)
        if len(inputs) >= MIN_CLUSTERING_INPUT:
            tfidf_word, input_dict = tfidf_v2(inputs)
            results = choose_cluster(tfidf_word, inputs, cluster_num=cluster_num, version=version)
            # cluster the comment texts
            cluster_text = text_classify(inputs, results, tfidf_word)
            evaluation_inputs = []
            for k, v in enumerate(cluster_text):
                inputs[k]['label'] = v['label']
                inputs[k]['weight'] = v['weight']
                evaluation_inputs.append(inputs[k])
            # cluster evaluation: weights and cluster labels
            recommend_text = cluster_evaluation(evaluation_inputs, min_size=cluster_eva_min_size)
            for label, items in recommend_text.iteritems():
                if label != OTHER_CLUSTER_ID:
                    clusters_infos['features'][label] = results[label]
                    print '11111', results[label]
                    for item in items:
                        item['clusterid'] = label
                        item['weight'] = item['weight']
                    final_inputs.extend(items)
                else:
                    for item in items:
                        item['clusterid'] = OTHER_CLUSTER_ID
                    items_infos.extend(items)
        else:
            # too few items: show the list directly
            tfidf_word, input_dict = tfidf_v2(inputs)
            uuid_label = str(uuid.uuid4())
            clusters_infos['features'][uuid_label] = [kw for kw, count in tfidf_word]
            print '22222222', clusters_infos['features'][uuid_label]
            for r in inputs:
                r['clusterid'] = uuid_label
                r['weight'] = global_weight_cal_tfidf(tfidf_word, r)
            final_inputs.extend(inputs)

    # deduplicate within each sub-opinion cluster
    cluster_items = dict()
    for r in final_inputs:
        clusterid = r['clusterid']
        try:
            cluster_items[clusterid].append(r)
        except KeyError:
            cluster_items[clusterid] = [r]
    for clusterid, items in cluster_items.iteritems():
        results = duplicate(items)
        items_infos.extend(results)

    return {'cluster_infos': clusters_infos, 'item_infos': items_infos}
def comments_sentiment_rubbish_calculation(comments, logger):
    """Input is a batch of comments with title and content168 fields.
       Output: item_infos, a per-item list with fields sentiment, same_from, duplicate.
    """
    # clusterid for meaningless items: ads from ad_filter, spam from the SVM,
    # news from the subjective/objective classifier
    NON_CLUSTER_ID = 'nonsense'
    # clusterid for meaningful items
    MEAN_CLUSTER_ID = 'sentiment'

    # per-item list; each item stores clusterid, weight, sentiment
    items_infos = []

    # strip sentiment/label/clusterid/ad_label/subob_label/rub_label
    clear_keys = ['sentiment', 'label', 'clusterid', 'ad_label', 'subob_label', 'rub_label', 'weight']
    inputs = []
    for r in comments:
        for key in clear_keys:
            if key in r:
                del r[key]
        inputs.append(r)
    comments = inputs

    # preprocess data fields
    inputs = []
    for r in comments:
        r['title'] = ''
        try:
            r['content168'] = r['content168'].encode('utf-8')
        except:
            r['content168'] = r['text'].encode('utf-8')
        r['content'] = r['content168']
        r['text'] = r['content168']
        inputs.append(r)

    # first the neutral classifier, then the 3-class classifier
    svm_inputs = []
    for r in inputs:
        sentiment = neutral_classifier(r)
        if sentiment != 0:
            sentiment = triple_classifier(r)
            if sentiment == 0:
                svm_inputs.append(r)
            else:
                r['sentiment'] = sentiment
                items_infos.append(r)
        else:
            svm_inputs.append(r)

    # sentiment adjustment
    senti_modify_inputs = []
    for r in svm_inputs:
        sentiment = mid_sentiment_classify(r['text'])
        if sentiment == -1:
            sentiment = 0  # neutral
        if sentiment != 0:
            r['sentiment'] = sentiment
            items_infos.append(r)
        else:
            r['sentiment'] = sentiment
            senti_modify_inputs.append(r)

    # news classification
    inputs = []
    for r in senti_modify_inputs:
        r = subob_classifier(r)
        if r['subob_label'] == 1:  # subjective/objective classification
            r['sentiment'] = NON_CLUSTER_ID + '_news'  # news
            items_infos.append(r)
        else:
            inputs.append(r)

    # remove spam
    items = rubbish_classifier(inputs)
    for item in items:
        if item['rub_label'] == 1:  # SVM spam removal
            item['sentiment'] = NON_CLUSTER_ID + '_rub'
        else:
            # filter ads with simple rules
            item = ad_filter(item)
            if item['ad_label'] == 1:
                item['sentiment'] = NON_CLUSTER_ID + '_rub'
        items_infos.append(item)

    # deduplicate texts within each sentiment class
    sentiment_dict = dict()
    for item in items_infos:
        if 'sentiment' in item:
            sentiment = item['sentiment']
            try:
                sentiment_dict[sentiment].append(item)
            except KeyError:
                sentiment_dict[sentiment] = [item]

    items_infos = []
    for sentiment, items in sentiment_dict.iteritems():
        items_list = duplicate(items)
        items_infos.extend(items_list)

    return {'item_infos': items_infos}
def comments_rubbish_clustering_calculation(comments, logger,
                                            cluster_num=COMMENT_WORDS_CLUSTER_NUM,
                                            cluster_eva_min_size=CLUSTER_EVA_MIN_SIZE,
                                            version=COMMENT_CLUSTERING_PROCESS_FOR_CLUTO_VERSION):
    """Comment spam filtering and clustering.
       input: comments (each comment carries news_id and news_content)
       cluster_infos: cluster information
       item_infos: per-item list with fields clusterid, weight, same_from, duplicate
    """
    # clusterid for meaningless items: ads from ad_filter, spam from the SVM,
    # news from the subjective/objective classifier
    NON_CLUSTER_ID = 'nonsense'
    # clusterid for the "other" class
    OTHER_CLUSTER_ID = 'other'
    # minimum number of inputs for clustering; below this, skip clustering
    MIN_CLUSTERING_INPUT = 30
    # cluster info, mainly the feature words of each cluster
    clusters_infos = {'features': dict()}

    # strip sentiment/label/clusterid/ad_label/subob_label/rub_label
    clear_keys = ['label', 'ad_label', 'subob_label', 'rub_label', 'weight']
    inputs = []
    for r in comments:
        for key in clear_keys:
            if key in r:
                del r[key]
        inputs.append(r)
    comments = inputs

    # per-item list; each item stores clusterid, weight, sentiment
    items_infos = []

    # preprocess data fields
    inputs = []
    for r in comments:
        r['title'] = ''
        r['content168'] = r['content168'].encode('utf-8')
        r['content'] = r['content168']
        r['text'] = r['content168']
        if 'news_content' in r and r['news_content']:
            r['news_content'] = r['news_content'].encode('utf-8')
        else:
            r['news_content'] = ''
        # filter ads with simple rules
        item = ad_filter(r)
        if item['ad_label'] == 0:
            inputs.append(item)
        else:
            item['clusterid'] = NON_CLUSTER_ID + '_rub'
            items_infos.append(item)

    # remove spam with the SVM
    items = rubbish_classifier(inputs)
    inputs = []
    for item in items:
        if item['rub_label'] == 1:
            item['clusterid'] = NON_CLUSTER_ID + '_rub'
            items_infos.append(item)
        else:
            inputs.append(item)

    # group comments by news
    results = comment_news(inputs)
    final_inputs = []
    for news_id, _inputs in results.iteritems():
        # filter comments against their news
        _inputs = filter_comment(_inputs)
        inputs = [r for r in _inputs if r['rub_label'] == 0]
        inputs_rubbish = [r for r in _inputs if r['rub_label'] == 1]
        for r in inputs_rubbish:
            r['clusterid'] = NON_CLUSTER_ID + '_rub'
            items_infos.append(r)
        if len(inputs) >= MIN_CLUSTERING_INPUT:
            tfidf_word, input_dict = tfidf_v2(inputs)
            results = choose_cluster(tfidf_word, inputs, cluster_num=cluster_num, version=version)
            # cluster the comment texts
            cluster_text = text_classify(inputs, results, tfidf_word)
            evaluation_inputs = []
            for k, v in enumerate(cluster_text):
                inputs[k]['label'] = v['label']
                inputs[k]['weight'] = v['weight']
                evaluation_inputs.append(inputs[k])
            # cluster evaluation: weights and cluster labels
            recommend_text = cluster_evaluation(evaluation_inputs, min_size=cluster_eva_min_size)
            for label, items in recommend_text.iteritems():
                if label != OTHER_CLUSTER_ID:
                    clusters_infos['features'][label] = results[label]
                    for item in items:
                        item['clusterid'] = label
                        item['weight'] = item['weight']
                    final_inputs.extend(items)
                else:
                    for item in items:
                        item['clusterid'] = OTHER_CLUSTER_ID
                    items_infos.extend(items)
        else:
            # too few items: show the list directly
            tfidf_word, input_dict = tfidf_v2(inputs)
            uuid_label = str(uuid.uuid4())
            clusters_infos['features'][uuid_label] = [kw for kw, count in tfidf_word]
            for r in inputs:
                r['clusterid'] = uuid_label
                r['weight'] = global_weight_cal_tfidf(tfidf_word, r)
            final_inputs.extend(inputs)

    # deduplicate within each sub-opinion cluster
    cluster_items = dict()
    for r in final_inputs:
        clusterid = r['clusterid']
        try:
            cluster_items[clusterid].append(r)
        except KeyError:
            cluster_items[clusterid] = [r]
    for clusterid, items in cluster_items.iteritems():
        results = duplicate(items)
        items_infos.extend(results)

    return {'cluster_infos': clusters_infos, 'item_infos': items_infos}
def test_regular_input2(self):
    self.assertEqual(duplicate([1, 2, 3, 4, 2]), 2)
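This test targets yet another duplicate: a function over a plain list that returns the repeated value. A minimal sketch that satisfies the assertion follows, returning the first value seen twice; how the real implementation resolves lists with several distinct repeats is not shown, so treat the scan order as an assumption.

def duplicate(nums):
    # Return the first value that appears a second time while scanning left
    # to right; None when all values are distinct (the all-distinct behavior
    # is an assumption, the test only covers a single repeat).
    seen = set()
    for n in nums:
        if n in seen:
            return n
        seen.add(n)
    return None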
def social_sensing(task_detail):
    # task name, sensors, stop time, previous status, creator, timestamp
    task_name = task_detail[0]
    social_sensors = task_detail[1]
    ts = int(task_detail[2])

    wb = Workbook()
    ws = wb.create_sheet()

    print ts2date(ts)
    # PART 1
    # forward_result = get_forward_numerical_info(task_name, ts, create_by)
    # original/retweeted weibo mid lists from the previous time window
    forward_origin_weibo_list = query_mid_list(ts - time_interval, social_sensors, forward_time_range)
    forward_retweeted_weibo_list = query_mid_list(ts - time_interval, social_sensors, forward_time_range, 3)
    # original weibo mid list in the current window
    current_mid_list = query_mid_list(ts, social_sensors, time_interval)
    current_retweeted_mid_list = query_mid_list(ts, social_sensors, time_interval, 3)
    all_mid_list = []
    all_mid_list.extend(current_mid_list)
    all_mid_list.extend(current_retweeted_mid_list)
    all_mid_list.extend(forward_origin_weibo_list)
    all_mid_list.extend(forward_retweeted_weibo_list)
    all_origin_list = []
    all_origin_list.extend(current_mid_list)
    all_origin_list.extend(forward_origin_weibo_list)
    all_origin_list = list(set(all_origin_list))
    all_retweeted_list = []
    all_retweeted_list.extend(current_retweeted_mid_list)
    all_retweeted_list.extend(forward_retweeted_weibo_list)  # mid/root-mid of retweeted weibos
    all_retweeted_list = list(set(all_retweeted_list))

    print "all mid list: ", len(all_mid_list)
    # print "all_origin_list", all_origin_list
    # print "all_retweeted_list", all_retweeted_list

    # query retweet/comment counts in the current window, aggregated by message_type
    statistics_count = query_related_weibo(ts, all_mid_list, time_interval)
    if all_origin_list:
        origin_weibo_detail = query_hot_weibo(ts, all_origin_list, time_interval)  # details of original weibos
    else:
        origin_weibo_detail = {}
    if all_retweeted_list:
        retweeted_weibo_detail = query_hot_weibo(ts, all_retweeted_list, time_interval)  # details of retweeted weibos
    else:
        retweeted_weibo_detail = {}
    current_total_count = statistics_count['total_count']  # total number of weibos in the current window
    current_retweeted_count = statistics_count['retweeted']
    current_comment_count = statistics_count['comment']

    """
    # aggregate important users in the current window
    important_uid_list = []
    datetime = ts2datetime(ts-time_interval)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    if exist_es:
        search_results = get_important_user(ts, all_mid_list, time_interval)
        important_uid_list = search_results
    # match important users from the portrait library using the uid list
    if important_uid_list:
        important_results = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type, body={"ids": important_uid_list})['docs']
    else:
        important_results = []
    filter_important_list = []  # uid_list
    if important_results:
        for item in important_results:
            if item['found']:
                #if item['_source']['importance'] > IMPORTANT_USER_THRESHOULD:
                filter_important_list.append(item['_id'])
    print "filter_important_list", filter_important_list
    print "important_results", important_uid_list
    """

    # sensing decision
    # sensed items: all_mid_list
    sensitive_text_list = []
    tmp_sensitive_warning = ""
    text_dict = dict()  # text info
    mid_value = dict()  # per-text value
    duplicate_dict = dict()  # duplicate map
    portrait_dict = dict()  # background info
    classify_text_dict = dict()  # texts for classification
    classify_uid_list = []
    duplicate_text_list = []
    sensitive_words_dict = dict()
    sensitive_weibo_detail = {}

    # start when an event occurs
    if 1:
        index_list = []
        important_words = []
        datetime_1 = ts2datetime(ts)
        index_name_1 = flow_text_index_name_pre + datetime_1
        exist_es = es_text.indices.exists(index=index_name_1)
        if exist_es:
            index_list.append(index_name_1)
        datetime_2 = ts2datetime(ts - DAY)
        index_name_2 = flow_text_index_name_pre + datetime_2
        exist_es = es_text.indices.exists(index=index_name_2)
        if exist_es:
            index_list.append(index_name_2)
        if index_list and all_mid_list:
            query_body = {
                "query": {
                    "filtered": {
                        "filter": {
                            "terms": {"mid": all_mid_list}
                        }
                    }
                },
                "size": 5000
            }
            search_results = es_text.search(index=index_list, doc_type="text", body=query_body)['hits']['hits']
            tmp_sensitive_warning = ""
            text_dict = dict()  # text info
            mid_value = dict()  # per-text value
            duplicate_dict = dict()  # duplicate map
            portrait_dict = dict()  # background info
            classify_text_dict = dict()  # texts for classification
            classify_uid_list = []
            duplicate_text_list = []
            sensitive_words_dict = dict()
            if search_results:
                for item in search_results:
                    iter_uid = item['_source']['uid']
                    iter_mid = item['_source']['mid']
                    iter_text = item['_source']['text'].encode('utf-8', 'ignore')
                    iter_sensitive = item['_source'].get('sensitive', 0)
                    duplicate_text_list.append({"_id": iter_mid, "title": "", "content": iter_text.decode("utf-8", 'ignore')})
                    if iter_sensitive:
                        tmp_sensitive_warning = signal_sensitive_variation  # weibos containing sensitive words
                        sensitive_words_dict[iter_mid] = iter_sensitive
                    keywords_dict = json.loads(item['_source']['keywords_dict'])
                    personal_keywords_dict = dict()
                    for k, v in keywords_dict.iteritems():
                        k = k.encode('utf-8', 'ignore')
                        personal_keywords_dict[k] = v
                    classify_text_dict[iter_mid] = personal_keywords_dict
                    classify_uid_list.append(iter_uid)
            # deduplicate
            if duplicate_text_list:
                dup_results = duplicate(duplicate_text_list)
                for item in dup_results:
                    if item['duplicate']:
                        duplicate_dict[item['_id']] = item['same_from']
            # classify
            mid_value = dict()
            if classify_text_dict:
                classify_results = topic_classfiy(classify_uid_list, classify_text_dict)
                # print "classify_results: ", classify_results
                for k, v in classify_results.iteritems():  # mid:value
                    mid_value[k] = topic_value_dict[v[0]]
            if sensitive_words_dict:
                sensitive_mid_list = sensitive_words_dict.keys()
                sensitive_weibo_detail = query_hot_weibo(ts, sensitive_mid_list, time_interval)

    results = dict()
    results['mid_topic_value'] = json.dumps(mid_value)
    results['duplicate_dict'] = json.dumps(duplicate_dict)
    results['sensitive_words_dict'] = json.dumps(sensitive_words_dict)
    results['sensitive_weibo_detail'] = json.dumps(sensitive_weibo_detail)
    results['origin_weibo_number'] = len(all_origin_list)
    results['retweeted_weibo_number'] = len(all_retweeted_list)
    results['origin_weibo_detail'] = json.dumps(origin_weibo_detail)
    results['retweeted_weibo_detail'] = json.dumps(retweeted_weibo_detail)
    results['retweeted_weibo_count'] = current_retweeted_count
    results['comment_weibo_count'] = current_comment_count
    results['weibo_total_number'] = current_total_count
    results['timestamp'] = ts

    # store the current-window info in ES
    es_prediction.index(index=index_sensing_task, doc_type=type_sensing_task, id=ts, body=results)

    # print results
    # temp_titles = list(results.keys())
    # temp_results = list(results.values())
    # ws.append(temp_titles)
    # ws.append(temp_results)
    # wb.save('./temp/temp' + str(ts) + '.xlsx')

    # find and show economy-related weibos
    # eco_mid_list = get_economics_mids(mid_value)
    # size = 10
    # get_origin_weibo_detail(ts, size, 'retweeted')
    # print eco_mid_list
    # eco_weibos = get_weibo_content(index_list, eco_mid_list)
    # print eco_weibos
    # eco_content = eco_weibos['_source']['text']
    # weibo_content = ''
    # for aaa in eco_weibos:
    #     weibo_content += aaa['_source']['text'] + '\n'
    # save_results(weibo_content, ts)
    return "1"
def social_sensing(task_detail):
    '''
    with open("prediction_uid.pkl", "r") as f:
        uid_model = pickle.load(f)
    with open("prediction_weibo.pkl", "r") as f:
        weibo_model = pickle.load(f)
    '''
    # task name, sensors, stop time, previous status, creator, timestamp
    task_name = task_detail[0]
    social_sensors = task_detail[1]
    # ts = int(task_detail[2])
    ts = float(task_detail[2])
    # xnr_user_no = task_detail[3]

    print ts2date(ts)

    index_list = []
    important_words = []
    datetime_1 = ts2datetime(ts)
    index_name_1 = flow_text_index_name_pre + datetime_1
    exist_es = es_text.indices.exists(index=index_name_1)
    if exist_es:
        index_list.append(index_name_1)
    datetime_2 = ts2datetime(ts - DAY)
    index_name_2 = flow_text_index_name_pre + datetime_2
    exist_es = es_text.indices.exists(index=index_name_2)
    if exist_es:
        index_list.append(index_name_2)
    if es_text.indices.exists(index=flow_text_index_name_pre + ts2datetime(ts - 2 * DAY)):
        index_list.append(flow_text_index_name_pre + ts2datetime(ts - 2 * DAY))

    # PART 1
    # forward_result = get_forward_numerical_info(task_name, ts, create_by)
    # original/retweeted weibo mid lists from the previous time window
    forward_origin_weibo_list, forward_1 = query_mid_list(ts - time_interval, social_sensors, forward_time_range)
    forward_retweeted_weibo_list, forward_3 = query_mid_list(ts - time_interval, social_sensors, forward_time_range, 3)
    # original weibo mid list in the current window
    current_mid_list, current_1 = query_mid_list(ts, social_sensors, time_interval)
    current_retweeted_mid_list, current_3 = query_mid_list(ts, social_sensors, time_interval, 3)
    all_mid_list = []
    all_mid_list.extend(current_mid_list)
    all_mid_list.extend(current_retweeted_mid_list)
    all_mid_list.extend(forward_origin_weibo_list)
    all_mid_list.extend(forward_retweeted_weibo_list)
    all_origin_list = []
    all_origin_list.extend(current_mid_list)
    all_origin_list.extend(forward_origin_weibo_list)
    all_origin_list = list(set(all_origin_list))
    all_retweeted_list = []
    all_retweeted_list.extend(current_retweeted_mid_list)
    all_retweeted_list.extend(forward_retweeted_weibo_list)  # mid/root-mid of retweeted weibos
    all_retweeted_list = list(set(all_retweeted_list))

    all_mid_list = filter_mid(all_mid_list)
    all_origin_list = filter_mid(all_origin_list)
    all_retweeted_list = filter_mid(all_retweeted_list)

    print "all mid list: ", len(all_mid_list)
    print "all_origin_list", len(all_origin_list)
    print "all_retweeted_list", len(all_retweeted_list)

    # query retweet/comment counts in the current window, aggregated by message_type
    # statistics_count = query_related_weibo(ts, all_mid_list, time_interval)
    if all_origin_list:
        # origin_weibo_detail = query_hot_weibo(ts, all_origin_list, time_interval)  # details of original weibos
        origin_weibo_detail = dict()
        for mid in all_origin_list:
            retweet_count = es_text.count(index=index_list, doc_type="text", body={"query": {"bool": {"must": [{"term": {"root_mid": mid}}, {"term": {"message_type": 3}}]}}})["count"]
            comment_count = es_text.count(index=index_list, doc_type="text", body={"query": {"bool": {"must": [{"term": {"root_mid": mid}}, {"term": {"message_type": 2}}]}}})["count"]
            tmp = dict()
            tmp["retweeted"] = retweet_count
            tmp["comment"] = comment_count
            origin_weibo_detail[mid] = tmp
    else:
        origin_weibo_detail = {}
    print "len(origin_weibo_detail): ", len(origin_weibo_detail)

    if all_retweeted_list:
        retweeted_weibo_detail = dict()
        for mid in all_retweeted_list:
            retweet_count = es_text.count(index=index_list, doc_type="text", body={"query": {"bool": {"must": [{"term": {"root_mid": mid}}, {"term": {"message_type": 3}}]}}})["count"]
            comment_count = es_text.count(index=index_list, doc_type="text", body={"query": {"bool": {"must": [{"term": {"root_mid": mid}}, {"term": {"message_type": 2}}]}}})["count"]
            tmp = dict()
            tmp["retweeted"] = retweet_count
            tmp["comment"] = comment_count
            retweeted_weibo_detail[mid] = tmp
        # retweeted_weibo_detail = query_hot_weibo(ts, all_retweeted_list, time_interval)  # details of retweeted weibos
    else:
        retweeted_weibo_detail = {}
    print "len(retweeted_weibo_detail): ", len(retweeted_weibo_detail)

    # current_total_count = statistics_count['total_count']  # total number of weibos in the current window
    # current_retweeted_count = statistics_count['retweeted']
    # current_comment_count = statistics_count['comment']
    # all_mid_list = list(set(all_origin_list[:100]) | set(all_retweeted_list[:100]))

    # sensed items: all_mid_list
    sensitive_text_list = []
    tmp_sensitive_warning = ""
    text_dict = dict()  # text info
    mid_value = dict()  # per-text value
    duplicate_dict = dict()  # duplicate map
    portrait_dict = dict()  # background info
    classify_text_dict = dict()  # texts for classification
    classify_uid_list = []
    duplicate_text_list = []
    sensitive_words_dict = dict()
    sensitive_weibo_detail = {}
    trendline_dict = dict()
    all_text_dict = dict()

    # start when an event occurs
    if 1:
        print "index_list:", index_list
        if index_list and all_mid_list:
            query_body = {
                "query": {
                    "filtered": {
                        "filter": {
                            "terms": {"mid": all_mid_list}
                        }
                    }
                },
                "size": 5000
            }
            search_results = es_text.search(index=index_list, doc_type="text", body=query_body)['hits']['hits']
            print "search mid len: ", len(search_results)
            tmp_sensitive_warning = ""
            text_dict = dict()  # text info
            mid_value = dict()  # per-text value
            duplicate_dict = dict()  # duplicate map
            portrait_dict = dict()  # background info
            classify_text_dict = dict()  # texts for classification
            # classify_uid_list = []
            classify_mid_list = []
            duplicate_text_list = []
            sensitive_words_dict = dict()
            mid_ts_dict = dict()  # text publish time
            uid_prediction_dict = dict()
            weibo_prediction_dict = dict()
            trendline_dict = dict()
            feature_prediction_list = []  # feature
            mid_prediction_list = []  # corresponding mid
            if search_results:
                for item in search_results:
                    iter_uid = item['_source']['uid']
                    iter_mid = item['_source']['mid']
                    mid_ts_dict[iter_mid] = item["_source"]["timestamp"]
                    iter_text = item['_source']['text'].encode('utf-8', 'ignore')
                    iter_sensitive = item['_source'].get('sensitive', 0)
                    tmp_text = get_weibo(item['_source'])
                    all_text_dict[iter_mid] = tmp_text
                    duplicate_text_list.append({"_id": iter_mid, "title": "", "content": iter_text.decode("utf-8", 'ignore')})
                    if iter_sensitive:
                        tmp_sensitive_warning = signal_sensitive_variation  # weibos containing sensitive words
                        sensitive_words_dict[iter_mid] = iter_sensitive
                    keywords_dict = json.loads(item['_source']['keywords_dict'])
                    personal_keywords_dict = dict()
                    for k, v in keywords_dict.iteritems():
                        k = k.encode('utf-8', 'ignore')
                        personal_keywords_dict[k] = v
                    classify_text_dict[iter_mid] = personal_keywords_dict
                    # classify_uid_list.append(iter_uid)
                    classify_mid_list.append(iter_mid)
            # deduplicate
            print "start duplicate"
            if duplicate_text_list:
                dup_results = duplicate(duplicate_text_list)
                for item in dup_results:
                    if item['duplicate']:
                        duplicate_dict[item['_id']] = item['same_from']
            # classify
            print "start classify"
            mid_value = dict()
            if classify_text_dict:
                # classify_results = topic_classfiy(classify_uid_list, classify_text_dict)
                classify_results = topic_classfiy(classify_mid_list, classify_text_dict)
                # print "classify_results: ", classify_results
                for k, v in classify_results.iteritems():  # mid:value
                    # mid_value[k] = topic_value_dict[v[0]]
                    mid_value[k] = v[0]
                    # feature_list = organize_feature(k, mid_ts_dict[k])
                    # feature_prediction_list.append(feature_list)  # feature list
                    # mid_prediction_list.append(k)  # corresponding mid
            # prediction
            """
            print "start prediction"
            weibo_prediction_result = weibo_model.predict(feature_prediction_list)
            uid_prediction_result = uid_model.predict(feature_prediction_list)
            for i in range(len(mid_prediction_list)):
                if i % 100 == 0:
                    print i
                uid_prediction_dict[mid_prediction_list[i]] = uid_prediction_result[i]
                weibo_prediction_dict[mid_prediction_list[i]] = weibo_prediction_result[i]
                tmp_trendline = trendline_list(mid_prediction_list[i], weibo_prediction_result[i], mid_ts_dict[mid_prediction_list[i]])
                trendline_dict[mid_prediction_list[i]] = tmp_trendline
            """

    # organize data
    mid_list = all_text_dict.keys()
    print "final mid:", len(mid_list)
    print "intersection: ", len(set(mid_list) & set(all_mid_list))
    bulk_action = []
    count = 0
    for mid in mid_list:
        iter_dict = dict()
        if origin_weibo_detail.has_key(mid):
            iter_dict.update(origin_weibo_detail[mid])
            iter_dict["type"] = 1
        elif retweeted_weibo_detail.has_key(mid):
            iter_dict.update(retweeted_weibo_detail[mid])
            iter_dict["type"] = 3
        else:
            iter_dict["retweeted"] = 0
            iter_dict["comment"] = 0
            print "mid in all_mid_list: ", mid in set(all_mid_list)
        # iter_dict["trendline"] = json.dumps(trendline_dict[mid])
        if duplicate_dict.has_key(mid):
            iter_dict["duplicate"] = duplicate_dict[mid]
        else:
            iter_dict["duplicate"] = ""
        # iter_dict["uid_prediction"] = uid_prediction_dict[mid]
        # iter_dict["weibo_prediction"] = weibo_prediction_dict[mid]
        iter_dict["compute_status"] = 0  # not yet computed
        iter_dict["topic_field"] = mid_value[mid]
        iter_dict["detect_ts"] = ts
        # iter_dict["xnr_user_no"] = xnr_user_no
        iter_dict.update(all_text_dict[mid])
        count += 1
        # print 'iter_dict:::', iter_dict
        # _id = xnr_user_no + '_' + mid
        _id = mid
        bulk_action.extend([{"index": {"_id": _id}}, iter_dict])
        if count % 500 == 0:
            es_xnr.bulk(bulk_action, index="social_sensing_text", doc_type="text", timeout=600)
            bulk_action = []

    if bulk_action:
        es_xnr.bulk(bulk_action, index="social_sensing_text", doc_type="text", timeout=600)

    return "1"
def sheqi_rec(sort_field='hot'):
    module_keywords = [("sogou_weixin_search", "keywords_corp_weixin.txt"),
                       ("sogou_weixin_search", "keywords_leader_weixin.txt"),
                       ("sogou_weixin_search", "keywords_hot_weixin.txt"),
                       ("baidu_ns_search", "keywords_corp_baidu.txt"),
                       ("baidu_ns_search", "keywords_leader_baidu.txt"),
                       ("baidu_ns_search", "keywords_hot_baidu.txt")]
    keywords_file_list = list(set([v for k, v in module_keywords]))
    source_website_list = list(set([k for k, v in module_keywords]))
    query_dict = {
        "timestamp": {
            "$gte": START_TS,
            "$lt": END_TS
        },
        "keywords_hit": True,
        "rubbish": False,
        "category": {"$in": keywords_file_list},
        "source_website": {"$in": source_website_list}
    }
    count = mongo.boatcol.find(query_dict).count()
    print "sheqi news candidate %s count: " % sort_field, count
    results = mongo.boatcol.find(query_dict).sort(sort_field, pymongo.DESCENDING)
    results = [r for r in results]
    for r in results:
        r['title'] = r['title'].encode('utf-8')
        if r['summary']:
            r['content'] = r['summary'].encode('utf-8')
        else:
            r['content'] = ''
    results = duplicate(results)
    fw = csv.writer(open(result_path + 'sheqi_news_weixin_sort_%s_%s_%s.csv' % (sort_field, START_DATETIME, END_DATETIME), 'wb'), delimiter='^')
    fw.writerow(NEWS_KEYS)
    for r in results:
        csvrow = []
        if r['duplicate'] == False:
            for key in NEWS_KEYS:
                if key in r:
                    item = _encode_utf8(r[key])
                else:
                    item = ""
                csvrow.append(item)
            fw.writerow(csvrow)

    module_keywords = [("weibo_api_search_spider", "keywords_corp_weiboapi.txt"),
                       ("weibo_api_search_spider", "keywords_leader_weiboapi.txt"),
                       ("weibo_api_search_spider", "keywords_hot_weiboapi.txt"),
                       ("tianya_bbs_search", "keywords_corp_forum.txt"),
                       ("tianya_bbs_search", "keywords_leader_forum.txt"),
                       ("tianya_bbs_search", "keywords_hot_forum.txt")]
    weibo_forum_results = []
    keywords_file_list = list(set([v for k, v in module_keywords]))
    query_dict = {
        "timestamp": {
            "$gte": START_TS,
            "$lt": END_TS
        },
        "keywords_hit": True,
        "rubbish": False,
        "source_website": "weibo_api_search_spider",
        "source_category": {"$in": keywords_file_list}
    }
    results = mongo.master_timeline_weibo.find(query_dict).sort(sort_field, pymongo.DESCENDING)
    weibo_forum_results.extend([r for r in results])
    query_dict = {
        "timestamp": {
            "$gte": START_TS,
            "$lt": END_TS
        },
        "keywords_hit": True,
        "rubbish": False,
        "source_website": "tianya_bbs_search",
        "category": {"$in": keywords_file_list}
    }
    results = mongo.boatcol.find(query_dict).sort(sort_field, pymongo.DESCENDING)
    weibo_forum_results.extend([r for r in results])
    if sort_field == "rel_score":
        for r in weibo_forum_results:
            if sort_field not in r:
                r[sort_field] = 0
    results = sorted(weibo_forum_results, key=lambda item: item[sort_field], reverse=True)
    for r in results:
        if 'title' in r:
            r['title'] = r['title'].encode('utf-8')
        else:
            r['title'] = ''.encode('utf-8')
        if 'summary' in r:
            r['content'] = r['summary'].encode('utf-8')
        else:
            r['content'] = r['text'].encode('utf-8')
    results = duplicate(results)
    fw = csv.writer(open(result_path + 'sheqi_weibo_forum_sort_%s_%s_%s.csv' % (sort_field, START_DATETIME, END_DATETIME), 'wb'), delimiter='^')
    WEIBO_FORUM_KEYS = ['id', 'title', 'content', 'timestamp', 'created_at', 'source_website', 'source_category', 'hot', 'sensi', 'sentiment']
    fw.writerow(WEIBO_FORUM_KEYS)
    for r in results:
        csvrow = []
        if r['duplicate'] == False:
            for key in WEIBO_FORUM_KEYS:
                if 'created_at' == key and 'created_at' not in r and 'datetime' in r:
                    r['created_at'] = r['datetime']
                if 'source_category' == key and 'source_category' not in r and 'category' in r:
                    r['source_category'] = r['category']
                csvrow.append(_encode_utf8(r[key]))
            fw.writerow(csvrow)
def social_sensing(task_detail):
    # task name, sensors, task creation time (start of the sensing window)
    task_name = task_detail[0]
    social_sensors = task_detail[1]
    ts = float(task_detail[2])
    print 'sensing_start_time:', ts2date(ts)

    index_list = ["flow_text_gangdu"]  # indexes being sensed; adjust later as needed

    # original/retweeted weibo mids from the previous two days (excluding the current hour)
    forward_origin_weibo_list, forward_1 = query_mid_list(ts - time_interval, social_sensors, forward_time_range)
    forward_retweeted_weibo_list, forward_3 = query_mid_list(ts - time_interval, social_sensors, forward_time_range, 3)
    # original/retweeted weibo mids from the previous hour
    current_origin_weibo_list, current_1 = query_mid_list(ts, social_sensors, time_interval)
    current_retweeted_weibo_list, current_3 = query_mid_list(ts, social_sensors, time_interval, 3)

    all_mid_list = []
    all_mid_list.extend(current_origin_weibo_list)
    all_mid_list.extend(current_retweeted_weibo_list)
    all_mid_list.extend(forward_origin_weibo_list)
    all_mid_list.extend(forward_retweeted_weibo_list)

    all_origin_list = []
    all_origin_list.extend(current_origin_weibo_list)
    all_origin_list.extend(forward_origin_weibo_list)
    all_origin_list = list(set(all_origin_list))

    all_retweeted_list = []
    all_retweeted_list.extend(current_retweeted_weibo_list)
    all_retweeted_list.extend(forward_retweeted_weibo_list)  # mid/root_mid of retweeted weibos
    all_retweeted_list = list(set(all_retweeted_list))

    all_mid_list = filter_mid(all_mid_list)
    all_origin_list = filter_mid(all_origin_list)
    all_retweeted_list = filter_mid(all_retweeted_list)

    print "all mid list: ", len(all_mid_list)
    print "all_origin_list", len(all_origin_list)
    print "all_retweeted_list", len(all_retweeted_list)

    # count retweets/comments of each weibo within the current window, aggregated by message_type
    if all_origin_list:
        origin_weibo_detail = dict()
        for mid in all_origin_list:
            retweet_count = es_flow_text.count(index=index_list, doc_type="text", body={"query": {"bool": {"must": [{"term": {"root_mid": mid}}, {"term": {"message_type": 3}}]}}})["count"]
            comment_count = es_flow_text.count(index=index_list, doc_type="text", body={"query": {"bool": {"must": [{"term": {"root_mid": mid}}, {"term": {"message_type": 2}}]}}})["count"]
            tmp = dict()
            tmp["retweeted_stat"] = retweet_count
            tmp["comment_stat"] = comment_count
            origin_weibo_detail[mid] = tmp
    else:
        origin_weibo_detail = {}
    print "len(origin_weibo_detail): ", len(origin_weibo_detail)

    if all_retweeted_list:
        retweeted_weibo_detail = dict()
        for mid in all_retweeted_list:
            retweet_count = es_flow_text.count(index=index_list, doc_type="text", body={"query": {"bool": {"must": [{"term": {"root_mid": mid}}, {"term": {"message_type": 3}}]}}})["count"]
            comment_count = es_flow_text.count(index=index_list, doc_type="text", body={"query": {"bool": {"must": [{"term": {"root_mid": mid}}, {"term": {"message_type": 2}}]}}})["count"]
            tmp = dict()
            tmp["retweeted_stat"] = retweet_count
            tmp["comment_stat"] = comment_count
            retweeted_weibo_detail[mid] = tmp
    else:
        retweeted_weibo_detail = {}
    print "len(retweeted_weibo_detail): ", len(retweeted_weibo_detail)

    # once an event is sensed, query all of all_mid_list (one hour + two days)
    if index_list and all_mid_list:
        query_body = {
            "query": {
                "filtered": {
                    "filter": {
                        "terms": {"mid": all_mid_list}
                    }
                }
            },
            "size": 5000
        }
        search_results = es_flow_text.search(index=index_list, doc_type="text", body=query_body)['hits']['hits']
        print "search mid len: ", len(search_results)

        all_text_dict = dict()        # sensed weibos, keyed by mid
        mid_value = dict()            # topic label per text
        duplicate_dict = dict()       # duplicate mapping: mid -> same_from
        classify_text_dict = dict()   # texts for classification
        sensitive_words_dict = dict()
        duplicate_text_list = []
        classify_mid_list = []
        if search_results:
            for item in search_results:
                iter_mid = item['_source']['mid']
                iter_text = item['_source']['text'].encode('utf-8', 'ignore')
                iter_sensitive = item['_source'].get('sensitive', 0)
                tmp_text = get_weibo(item['_source'])
                all_text_dict[iter_mid] = tmp_text
                duplicate_text_list.append({"_id": iter_mid, "title": "", "content": iter_text.decode("utf-8", 'ignore')})
                if iter_sensitive:
                    sensitive_words_dict[iter_mid] = iter_sensitive
                keywords_dict = json.loads(item['_source']['keywords_dict'])
                personal_keywords_dict = dict()
                for k, v in keywords_dict.iteritems():
                    k = k.encode('utf-8', 'ignore')
                    personal_keywords_dict[k] = v
                classify_text_dict[iter_mid] = personal_keywords_dict
                classify_mid_list.append(iter_mid)

        # deduplicate
        print "start duplicate:", '----'
        if duplicate_text_list:
            dup_results = duplicate(duplicate_text_list)
            for item in dup_results:
                if item['duplicate']:
                    duplicate_dict[item['_id']] = item['same_from']
        print '----', "duplicate finished:"

        # classify
        print "start classify:", '----'
        mid_value = dict()
        if classify_text_dict:
            classify_results = topic_classfiy(classify_mid_list, classify_text_dict)
            for k, v in classify_results.iteritems():  # mid: value
                mid_value[k] = v[0]
        print '----', "classify finished:"

        mid_list = all_text_dict.keys()
        mid_duplicate_list = set(duplicate_dict.keys()) | set(duplicate_dict.values())
        intersection_list = set(mid_list) - (set(duplicate_dict.keys()) | set(duplicate_dict.values()))
        print "final mid:", len(mid_list)
        print "duplicate mid:", len(mid_duplicate_list)
        print "duplicate:", len(set(duplicate_dict.values()))
        print "single: ", len(intersection_list)

        # invert the duplicate mapping: same_from -> list of duplicate mids
        reverse_duplicate_dict = defaultdict(list)
        for k, v in duplicate_dict.iteritems():
            reverse_duplicate_dict[v].append(k)
        for term in intersection_list:
            reverse_duplicate_dict[term] = [term]

        bulk_action = []
        count = 0
        for id in reverse_duplicate_dict.keys():
            iter_dict = dict()
            inter_mid_list = []
            inter_mid_list.append(id)
            inter_mid_list.extend(reverse_duplicate_dict[id])
            # initiator: the earliest post in the duplicate group
            timestamp_list = []
            for mid in inter_mid_list:
                timestamp_list.append(all_text_dict[mid]['timestamp'])
            mid_initial = inter_mid_list[timestamp_list.index(min(timestamp_list))]
            # pusher: the most-retweeted post in the duplicate group
            push_list = []
            for mid in inter_mid_list:
                if mid in origin_weibo_detail:
                    retweeted_stat = origin_weibo_detail[mid]['retweeted_stat']
                elif mid in retweeted_weibo_detail:
                    retweeted_stat = retweeted_weibo_detail[mid]['retweeted_stat']
                else:
                    retweeted_stat = 0
                push_list.append(retweeted_stat)
            mid_push = inter_mid_list[push_list.index(max(push_list))]
            mid = mid_push
            if mid in origin_weibo_detail:
                iter_dict.update(origin_weibo_detail[mid])  # update() merges the stat fields into iter_dict
                iter_dict["type"] = 1
            elif mid in retweeted_weibo_detail:
                iter_dict.update(retweeted_weibo_detail[mid])
                iter_dict["type"] = 0
            else:
                iter_dict["retweeted_stat"] = 0
                iter_dict["comment_stat"] = 0
                iter_dict["type"] = -1
            # iter_dict["name"] = ''
            # iter_dict["heat"] = iter_dict["retweeted_stat"] + iter_dict["comment_stat"]
            iter_dict["status"] = 0  # whether under monitoring
            iter_dict["delete"] = 0  # whether deleted
            iter_dict["topic_field"] = eng2chi_dict[mid_value[mid]]  # category label
            iter_dict["detect_ts"] = ts  # sensing start time
            iter_dict["initiator"] = all_text_dict[mid_initial]['uid']  # initiator
            iter_dict["push"] = all_text_dict[mid_push]['uid']  # pusher
            iter_dict.update(all_text_dict[mid])
            count += 1
            _id = mid
            bulk_action.extend([{"index": {"_id": _id}}, iter_dict])
            if count % 500 == 0:
                es_sensor.bulk(bulk_action, index=index_content_sensing, doc_type=type_content_sensing, timeout=600)
                bulk_action = []
        if bulk_action:
            es_sensor.bulk(bulk_action, index=index_content_sensing, doc_type=type_content_sensing)

    return "1"
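# The post-dedup bookkeeping above (inverting duplicate_dict, then choosing the
# earliest post in each duplicate group as initiator and the most-retweeted one
# as pusher) is easy to lose inside the loop. The following is a stripped-down
# sketch of the same logic, not the original code: timestamps and retweets are
# hypothetical plain dicts keyed by mid.
from collections import defaultdict

def group_and_lead(duplicate_dict, all_mids, timestamps, retweets):
    # invert the same_from mapping: representative mid -> its duplicate group
    groups = defaultdict(list)
    for dup_mid, rep_mid in duplicate_dict.items():
        groups[rep_mid].append(dup_mid)
    # mids involved in no duplicate relation form singleton groups
    involved = set(duplicate_dict) | set(duplicate_dict.values())
    for mid in set(all_mids) - involved:
        groups[mid] = []
    leaders = {}
    for rep, dups in groups.items():
        members = [rep] + dups
        initiator = min(members, key=lambda m: timestamps[m])    # earliest post
        pusher = max(members, key=lambda m: retweets.get(m, 0))  # most retweeted
        leaders[rep] = (initiator, pusher)
    return leaders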
matrix = initialize(m, n, matrix)
# read matrix entries as float or int
for i in range(m):
    for j in range(n):
        print('entry in row: ', i + 1, ' column: ', j + 1)
        # try/except guards against input that is not of the prescribed type
        while True:
            try:
                matrix[i][j] = float(input())
                break
            except ValueError:
                print("no special characters are allowed\nplease enter a valid float or integer value")
return matrix

'''calling all functions separately from main as well as from other files'''
# calling entry1
m, n = entry1()
# calling entry2
mat = entry2(m, n, matrix)
# initializing l2 and creating a deep copy of matrix in l2
l2 = initialize(m, n, l2)
l2 = duplicate(m, n, mat, l2)
ans = logic(m, n, l2)
print(ans)
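# The duplicate() called above is not defined in this fragment. Judging by the
# comment ("creating a deep copy of matrix in l2") and the call site, it likely
# copies every entry of mat into the pre-initialized l2 so later edits to l2 do
# not mutate mat. A minimal sketch, with the signature assumed from the call:
def duplicate(m, n, mat, l2):
    # element-wise copy; l2 is assumed to already be an m x n list of lists
    for i in range(m):
        for j in range(n):
            l2[i][j] = mat[i][j]
    return l2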
def enemy_rec(sort_field='hot'):
    keywords = get_keywords('keywords_enemy_baidu.txt')
    module_keywords = [("sogou_weixin_search", "keywords_enemy_weixin.txt"),
                       ("tianya_bbs_search", "keywords_enemy_forum.txt"),
                       ("xinhua_bbs_search", "keywords_enemy_forum.txt"),
                       ("baidu_ns_search", "keywords_enemy_baidu.txt")]
    keywords_file_list = list(set([v for k, v in module_keywords]))
    query_dict = {
        "timestamp": {"$gte": START_TS, "$lt": END_TS},
        "keywords_hit": True,
        "rubbish": False,
        "category": {"$in": keywords_file_list},
    }
    count = mongo.boatcol.find(query_dict).count()
    print "enemy news candidate %s count: " % sort_field, count
    results = mongo.boatcol.find(query_dict).sort(sort_field, pymongo.DESCENDING)
    results = [r for r in results]
    for r in results:
        r['title'] = r['title'].encode('utf-8')
        r['content'] = r['summary'].encode('utf-8')
    results = duplicate(results)
    fw = csv.writer(open(result_path + 'enemy_news_sort_%s_%s_%s.csv' % (sort_field, START_DATETIME, END_DATETIME), 'wb'), delimiter='^')
    fw.writerow(NEWS_KEYS)
    # group texts by competitor
    corp_news_dict = dict()
    for r in results:
        csvrow = []
        if r['duplicate'] == False:
            for key in NEWS_KEYS:
                csvrow.append(_encode_utf8(r[key]))
            fw.writerow(csvrow)
            if sort_field == 'hot':
                text = r['title'] + r['summary'].encode('utf-8')
                for keyword in keywords:
                    if keyword in text:
                        try:
                            corp_news_dict[keyword].append(r)
                        except KeyError:
                            corp_news_dict[keyword] = [r]
    if corp_news_dict != {}:
        fw = csv.writer(open(result_path + 'enemy_news_gongsi_%s_%s.csv' % (START_DATETIME, END_DATETIME), 'wb'), delimiter='^')
        new_keys = ['gongsi']
        new_keys += NEWS_KEYS
        fw.writerow(new_keys)
        for corp, news_dict in corp_news_dict.iteritems():
            for r in news_dict:
                csvrow = [corp]
                for key in NEWS_KEYS:
                    csvrow.append(_encode_utf8(r[key]))
                fw.writerow(csvrow)
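# _encode_utf8() is used throughout these CSV writers but never defined in this
# excerpt. A minimal sketch of what it presumably does under Python 2: encode
# unicode values to utf-8 bytes and stringify everything else, so csv.writer
# never receives raw unicode.
def _encode_utf8(value):
    # unicode strings get encoded; ints, floats, None, etc. are stringified
    if isinstance(value, unicode):
        return value.encode('utf-8')
    return str(value)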
def social_sensing():
    all_fid_list, end_ts = count_statis()
    if S_TYPE == 'test':
        all_fid_list = ALL_FID_LIST
    index_list = []
    for i in range(7):
        timestamp = end_ts - i * DAY
        flow_text_index_name = flow_text_index_name_pre + ts2datetime(timestamp)
        index_list.append(flow_text_index_name)
    # index_list = [flow_text_index_name_pre+date_1, flow_text_index_name_pre+date_2]
    print 'index_list...', index_list

    # sensed posts, all_fid_list
    sensitive_text_list = []
    tmp_sensitive_warning = ""
    text_dict = dict()            # text info
    fid_value = dict()            # topic label per text
    duplicate_dict = dict()       # duplicate mapping
    portrait_dict = dict()        # background info
    classify_text_dict = dict()   # texts for classification
    classify_uid_list = []
    classify_fid_list = []
    duplicate_text_list = []
    sensitive_words_dict = dict()
    sensitive_weibo_detail = {}
    all_text_dict = dict()
    fid_ts_dict = dict()          # publish time per text

    # start once an event occurs
    # if 1:
    if index_list and all_fid_list:
        query_body = {
            "query": {
                "filtered": {
                    "filter": {
                        "terms": {"fid": all_fid_list}
                    }
                }
            },
            "size": 5000
        }
        search_results = es.search(index=index_list, doc_type="text", body=query_body)['hits']['hits']
        print "search fid len: ", len(search_results)
        if search_results:
            for item in search_results:
                iter_uid = item['_source']['uid']
                iter_fid = item['_source']['fid']
                fid_ts_dict[iter_fid] = item["_source"]["timestamp"]
                iter_text = item['_source']['text'].encode('utf-8', 'ignore')
                iter_sensitive = item['_source'].get('sensitive', 0)
                tmp_text = get_weibo(item['_source'])
                all_text_dict[iter_fid] = tmp_text
                duplicate_text_list.append({
                    "_id": iter_fid,
                    "title": "",
                    "content": iter_text.decode("utf-8", 'ignore')
                })
                if iter_sensitive:
                    tmp_sensitive_warning = signal_sensitive_variation  # posts that hit sensitive words
                    sensitive_words_dict[iter_fid] = iter_sensitive
                keywords_dict = json.loads(item['_source']['keywords_dict'])
                personal_keywords_dict = dict()
                for k, v in keywords_dict.iteritems():
                    k = k.encode('utf-8', 'ignore')
                    personal_keywords_dict[k] = v
                classify_text_dict[iter_fid] = personal_keywords_dict
                # classify_uid_list.append(iter_uid)
                classify_fid_list.append(iter_fid)

        # deduplicate
        print "start duplicate"
        if duplicate_text_list:
            dup_results = duplicate(duplicate_text_list)
            for item in dup_results:
                if item['duplicate']:
                    duplicate_dict[item['_id']] = item['same_from']

        # classify
        print "start classify"
        fid_value = dict()
        if classify_text_dict:
            # classify_results = topic_classfiy(classify_uid_list, classify_text_dict)
            classify_results = topic_classfiy(classify_fid_list, classify_text_dict)
            # print "classify_results: ", classify_results
            for k, v in classify_results.iteritems():  # fid: value
                # fid_value[k] = topic_value_dict[v[0]]
                fid_value[k] = v[0]

    # organize data
    fid_list = all_text_dict.keys()
    print "final fid:", len(fid_list)
    print "intersection: ", len(set(fid_list) & set(all_fid_list))

    bulk_action = []
    count = 0
    # social_sensing_index_name = "fb_social_sensing_text_" + ts2datetime(end_ts)
    social_sensing_index_name = "fb_social_sensing_text"
    mappings_social_sensing_text(social_sensing_index_name)
    for fid in fid_list:
        iter_dict = dict()
        if fid in duplicate_dict:
            iter_dict["duplicate"] = duplicate_dict[fid]
        else:
            iter_dict["duplicate"] = ""
        iter_dict["compute_status"] = 0  # not yet computed
        iter_dict["topic_field"] = fid_value[fid]
        iter_dict["detect_ts"] = end_ts
        # iter_dict["xnr_user_no"] = xnr_user_no
        iter_dict.update(all_text_dict[fid])
        count += 1
        print 'iter_dict:::', iter_dict
        # _id = xnr_user_no + '_' + fid
        bulk_action.extend([{"index": {"_id": fid}}, iter_dict])
        if count % 500 == 0:
            es.bulk(bulk_action, index=social_sensing_index_name, doc_type="text", timeout=600)
            bulk_action = []
    if bulk_action:
        es.bulk(bulk_action, index=social_sensing_index_name, doc_type="text", timeout=600)
    return "1"
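# The bulk-indexing pattern used in the sensing functions above pairs an action
# line with each document and flushes every 500 docs to bound request size.
# A minimal, self-contained sketch of that pattern, assuming the elasticsearch-py
# client (the host address and index name below are placeholders):
from elasticsearch import Elasticsearch

es_client = Elasticsearch(['localhost:9200'])  # assumed address

def flush_in_batches(docs, index_name, batch_size=500):
    # docs: iterable of (doc_id, source_dict) pairs
    bulk_action = []
    for count, (doc_id, source) in enumerate(docs, 1):
        bulk_action.extend([{"index": {"_id": doc_id}}, source])
        if count % batch_size == 0:
            es_client.bulk(bulk_action, index=index_name, doc_type="text")
            bulk_action = []
    if bulk_action:
        es_client.bulk(bulk_action, index=index_name, doc_type="text")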
def social_sensing(task_detail):
    # task name, sensors, stop time, previous status, creator, timestamp
    task_name = task_detail[0]
    social_sensors = task_detail[1]
    stop_time = task_detail[2]
    forward_warning_status = task_detail[3]
    create_by = task_detail[4]
    ts = int(task_detail[5])
    new = int(task_detail[6])
    print ts2date(ts)

    # PART 1
    forward_result = get_forward_numerical_info(task_name, ts, create_by)
    # original/retweeted weibo lists from the preceding window
    forward_origin_weibo_list = query_mid_list(ts - time_interval, social_sensors, forward_time_range)
    forward_retweeted_weibo_list = query_mid_list(ts - time_interval, social_sensors, forward_time_range, 3)
    # original weibo list from the current window
    current_mid_list = query_mid_list(ts, social_sensors, time_interval)
    current_retweeted_mid_list = query_mid_list(ts, social_sensors, time_interval, 3)
    all_mid_list = []
    all_mid_list.extend(current_mid_list)
    all_mid_list.extend(current_retweeted_mid_list)
    all_mid_list.extend(forward_origin_weibo_list)
    all_mid_list.extend(forward_retweeted_weibo_list)
    all_origin_list = []
    all_origin_list.extend(current_mid_list)
    all_origin_list.extend(forward_origin_weibo_list)
    all_retweeted_list = []
    all_retweeted_list.extend(current_retweeted_mid_list)
    all_retweeted_list.extend(forward_retweeted_weibo_list)  # mid/root-mid of retweeted weibos

    print "all mid list: ", len(all_mid_list)
    # print "all_origin_list", all_origin_list
    # print "all_retweeted_list", all_retweeted_list

    # count retweets/comments within the current window, aggregated by message_type
    statistics_count = query_related_weibo(ts, all_mid_list, time_interval)
    if all_origin_list:
        origin_weibo_detail = query_hot_weibo(ts, all_origin_list, time_interval)  # details of original weibos
    else:
        origin_weibo_detail = {}
    if all_retweeted_list:
        retweeted_weibo_detail = query_hot_weibo(ts, all_retweeted_list, time_interval)  # details of retweeted weibos
    else:
        retweeted_weibo_detail = {}
    current_total_count = statistics_count['total_count']  # total weibos in the current window
    current_retweeted_count = statistics_count['retweeted']
    current_comment_count = statistics_count['comment']

    # PART 2
    # aggregate the distribution of positive, neutral, sad, and angry sentiment in the current window
    # sentiment_dict = {"0": "neutral", "1": "positive", "2": "sad", "3": "anger"}
    sentiment_count = {"0": 0, "1": 0, "2": 0, "3": 0}
    search_results = aggregation_sentiment_related_weibo(ts, all_mid_list, time_interval)
    sentiment_count = search_results
    print "sentiment_count: ", sentiment_count
    negative_key = ["2", "3", "4", "5", "6"]
    negative_count = 0
    for key in negative_key:
        negative_count += sentiment_count.get(key, 0)  # missing buckets count as 0

    # aggregate important users in the current window
    important_uid_list = []
    datetime = ts2datetime(ts - time_interval)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    if exist_es:
        search_results = get_important_user(ts, all_mid_list, time_interval)
        important_uid_list = search_results
    # match the uid list against the portrait library to find important users
    if important_uid_list:
        important_results = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type, body={"ids": important_uid_list})['docs']
    else:
        important_results = []
    filter_important_list = []  # uid_list
    if important_results:
        for item in important_results:
            if item['found']:
                # if item['_source']['importance'] > IMPORTANT_USER_THRESHOULD:
                filter_important_list.append(item['_id'])

    # sensing decision
    burst_reason = signal_nothing_variation
    warning_status = signal_nothing
    finish = unfinish_signal  # "0"
    process_status = "1"

    if forward_result[0]:
        # use the moving average to decide whether an event occurred
        mean_count = forward_result[1]
        std_count = forward_result[2]
        mean_sentiment = forward_result[3]
        std_sentiment = forward_result[4]
        if mean_count >= MEAN_COUNT and current_total_count > mean_count + 1.96 * std_count or current_total_count >= len(all_mid_list) * AVERAGE_COUNT:
            # anomaly detected
            if forward_warning_status == signal_brust:  # an event already exists; switch to tracking
                warning_status = signal_track
            else:
                warning_status = signal_brust
            burst_reason += signal_count_varition  # count anomaly
        if negative_count > mean_sentiment + 1.96 * std_sentiment and mean_sentiment >= MEAN_COUNT or negative_count >= len(all_mid_list) * AVERAGE_COUNT:
            warning_status = signal_brust
            burst_reason += signal_sentiment_varition  # negative-sentiment anomaly; "12" means both are anomalous
            if forward_warning_status == signal_brust:  # an event already exists; switch to tracking
                warning_status = signal_track

    if int(stop_time) <= ts:  # check whether the task has finished
        finish = finish_signal
        process_status = "0"

    # sensed posts, all_mid_list
    tmp_burst_reason = burst_reason
    topic_list = []
    sensitive_text_list = []

    # start once an event occurs
    # if warning_status:
    if 1:
        index_list = []
        important_words = []
        datetime_1 = ts2datetime(ts)
        index_name_1 = flow_text_index_name_pre + datetime_1
        exist_es = es_text.indices.exists(index=index_name_1)
        if exist_es:
            index_list.append(index_name_1)
        datetime_2 = ts2datetime(ts - DAY)
        index_name_2 = flow_text_index_name_pre + datetime_2
        exist_es = es_text.indices.exists(index=index_name_2)
        if exist_es:
            index_list.append(index_name_2)

        tmp_sensitive_warning = ""
        text_dict = dict()            # text info
        mid_value = dict()            # topic label per text
        duplicate_dict = dict()       # duplicate mapping
        portrait_dict = dict()        # background info
        classify_text_dict = dict()   # texts for classification
        classify_uid_list = []
        duplicate_text_list = []
        sensitive_words_dict = dict()

        if index_list and all_mid_list:
            query_body = {
                "query": {
                    "filtered": {
                        "filter": {
                            "terms": {"mid": all_mid_list}
                        }
                    }
                },
                "size": 5000
            }
            search_results = es_text.search(index=index_list, doc_type="text", body=query_body)['hits']['hits']
            if search_results:
                for item in search_results:
                    iter_uid = item['_source']['uid']
                    iter_mid = item['_source']['mid']
                    iter_text = item['_source']['text'].encode('utf-8', 'ignore')
                    iter_sensitive = item['_source'].get('sensitive', 0)
                    duplicate_text_list.append({
                        "_id": iter_mid,
                        "title": "",
                        "content": iter_text
                    })
                    if iter_sensitive:
                        tmp_sensitive_warning = signal_sensitive_variation  # posts that hit sensitive words
                        sensitive_words_dict[iter_mid] = iter_sensitive
                    keywords_dict = json.loads(item['_source']['keywords_dict'])
                    personal_keywords_dict = dict()
                    for k, v in keywords_dict.iteritems():
                        k = k.encode('utf-8', 'ignore')
                        personal_keywords_dict[k] = v
                    classify_text_dict[iter_mid] = personal_keywords_dict
                    classify_uid_list.append(iter_uid)
            # deduplicate
            if duplicate_text_list:
                dup_results = duplicate(duplicate_text_list)
                for item in dup_results:
                    if item['duplicate']:
                        duplicate_dict[item['_id']] = item['same_from']
            # classify
            if classify_text_dict:
                classify_results = topic_classfiy(classify_uid_list, classify_text_dict)
                mid_value = dict()
                # print "classify_results: ", classify_results
                for k, v in classify_results.iteritems():  # mid: value
                    mid_value[k] = topic_value_dict[v[0]]

        if tmp_sensitive_warning:
            warning_status = signal_brust
            burst_reason += signal_sensitive_variation
        sensitive_weibo_detail = {}
        if sensitive_words_dict:
            sensitive_mid_list = sensitive_words_dict.keys()
            sensitive_weibo_detail = query_hot_weibo(ts, sensitive_mid_list, time_interval)

    results = dict()
    results['mid_topic_value'] = json.dumps(mid_value)
    results['duplicate_dict'] = json.dumps(duplicate_dict)
    results['sensitive_words_dict'] = json.dumps(sensitive_words_dict)
    results['sensitive_weibo_detail'] = json.dumps(sensitive_weibo_detail)
    results['origin_weibo_number'] = len(all_origin_list)
    results['retweeted_weibo_number'] = len(all_retweeted_list)
    results['origin_weibo_detail'] = json.dumps(origin_weibo_detail)
    results['retweeted_weibo_detail'] = json.dumps(retweeted_weibo_detail)
    results['retweeted_weibo_count'] = current_retweeted_count
    results['comment_weibo_count'] = current_comment_count
    results['weibo_total_number'] = current_total_count
    results['sentiment_distribution'] = json.dumps(sentiment_count)
    results['important_users'] = json.dumps(filter_important_list)
    results['unfilter_users'] = json.dumps(important_uid_list)
    results['burst_reason'] = tmp_burst_reason
    results['timestamp'] = ts
    # results['clustering_topic'] = json.dumps(topic_list)

    # store this window's results in ES
    doctype = create_by + '-' + task_name
    es_user_portrait.index(index=index_sensing_task, doc_type=doctype, id=ts, body=results)

    # update the manage-social-sensing ES record
    if not new:
        temporal_result = es_user_portrait.get(index=index_manage_social_task, doc_type=task_doc_type, id=doctype)['_source']
        temporal_result['warning_status'] = warning_status
        temporal_result['burst_reason'] = tmp_burst_reason
        temporal_result['finish'] = finish
        temporal_result['processing_status'] = process_status
        history_status = json.loads(temporal_result['history_status'])
        history_status.append([ts, task_name, warning_status])
        temporal_result['history_status'] = json.dumps(history_status)
        es_user_portrait.index(index=index_manage_social_task, doc_type=task_doc_type, id=doctype, body=temporal_result)
    else:
        print "test"
    return "1"
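# The burst test above compares the current window against a moving average:
# an anomaly fires when the baseline mean is meaningful and the current value
# exceeds mean + 1.96 * std (roughly the upper edge of a 95% band). A minimal
# sketch of that test in isolation; min_mean stands in for the MEAN_COUNT
# constant, whose value is not shown in this excerpt.
def is_burst(current, mean, std, min_mean=50, z=1.96):
    # flag only when the baseline is large enough to trust and the
    # current count leaves the upper bound of the moving-average band
    return mean >= min_mean and current > mean + z * std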
def cluster_same_topic(ts):
    # output: list [[mid, mid, mid]]
    # 1. fetch weibos in the same category
    topic_list = topic_en2ch_dict.keys()
    topic_list.remove("life")
    topic_list.remove("art")
    print topic_list
    # for topic in topic_list:
    if 1:
        s_re = scan(es_user_portrait, query={"query": {"terms": {"category": topic_list}}, "size": 1000}, index=monitor_index_name, doc_type=monitor_index_type)
        text_list = []
        topic_mid_list = []
        while 1:
            try:
                tmp = s_re.next()
                scan_re = tmp['_source']
                detection_type = scan_re.get("detection", 0)
                if int(detection_type) == 1:
                    continue
                mid = scan_re['mid']
                text = scan_re['text']
                temp_dict = dict()
                # temp_dict["mid"] = mid
                # temp_dict["text"] = text
                # text_list.append(temp_dict)
                temp_dict['_id'] = mid
                temp_dict['content'] = re.sub(r'http://\S+', '', text)  # strip URLs
                temp_dict['title'] = ""
                text_list.append(temp_dict)
            except StopIteration:
                # print topic, "iterated once; begin clustering"
                if len(text_list) == 1:
                    # top_word = freq_word(text_list[0])
                    # topic_list = [top_word.keys()]
                    topic_mid_list.append(text_list[0])
                elif len(text_list) == 0:
                    topic_list = []
                    # print "no related weibo text"
                else:
                    results = duplicate(text_list)
                    dup_results = dict()
                    for item in results:
                        if item['duplicate']:
                            dup_list = dup_results[item['same_from']]
                            dup_list.append(item["_id"])
                            dup_results[item['same_from']] = dup_list
                        else:
                            dup_results[item['_id']] = [item['_id']]
                    topic_mid_list.extend(dup_results.values())
                """
                feature_words, input_word_dict = tfidf(text_list)  # build feature words and input data
                word_label, evaluation_results = kmeans(feature_words, text_list)  # clustering
                inputs = text_classify(text_list, word_label, feature_words)
                events_dict = dict()
                print inputs
                for item in inputs:
                    if not item.has_key('label'):
                        continue
                    cluster_id = item['label']
                    mid = item['mid']
                    if events_dict.has_key(cluster_id):
                        mid_list = events_dict[cluster_id]
                        mid_list.append(mid)
                        events_dict[cluster_id] = mid_list
                    else:
                        events_dict[cluster_id] = [mid]
                if events_dict.values():
                    topic_mid_list.extend(events_dict.values())
                # clustering_topic = cluster_evaluation(inputs)
                # print "clustering weibo topic"
                # sorted_dict = sorted(clustering_topic.items(), key=lambda x:x[1], reverse=True)
                """
                break
    return topic_mid_list
def sheqi_rec(sort_field='hot'):
    module_keywords = [("sogou_weixin_search", "keywords_corp_weixin.txt"),
                       ("sogou_weixin_search", "keywords_leader_weixin.txt"),
                       ("sogou_weixin_search", "keywords_hot_weixin.txt"),
                       ("tianya_bbs_search", "keywords_corp_forum.txt"),
                       ("tianya_bbs_search", "keywords_leader_forum.txt"),
                       ("tianya_bbs_search", "keywords_hot_forum.txt"),
                       ("baidu_ns_search", "keywords_corp_baidu.txt"),
                       ("baidu_ns_search", "keywords_leader_baidu.txt"),
                       ("baidu_ns_search", "keywords_hot_baidu.txt")]
    keywords_file_list = list(set([v for k, v in module_keywords]))
    query_dict = {
        "timestamp": {"$gte": START_TS, "$lt": END_TS},
        "keywords_hit": True,
        "rubbish": False,
        "category": {"$in": keywords_file_list},
    }
    count = mongo.boatcol.find(query_dict).count()
    print "sheqi news candidate %s count: " % sort_field, count
    results = mongo.boatcol.find(query_dict).sort(sort_field, pymongo.DESCENDING)
    results = [r for r in results]
    for r in results:
        r['title'] = r['title'].encode('utf-8')
        if r['summary']:
            r['content'] = r['summary'].encode('utf-8')
        else:
            r['content'] = ''
    results = duplicate(results)
    fw = csv.writer(open(result_path + 'sheqi_news_sort_%s_%s_%s.csv' % (sort_field, START_DATETIME, END_DATETIME), 'wb'), delimiter='^')
    fw.writerow(NEWS_KEYS)
    for r in results:
        csvrow = []
        if r['duplicate'] == False:
            for key in NEWS_KEYS:
                csvrow.append(_encode_utf8(r[key]))
            fw.writerow(csvrow)

    module_keywords = [("weibo_api_search_spider", "keywords_corp_weiboapi.txt"),
                       ("weibo_api_search_spider", "keywords_leader_weiboapi.txt"),
                       ("weibo_api_search_spider", "keywords_hot_weiboapi.txt")]
    keywords_file_list = list(set([v for k, v in module_keywords]))
    query_dict = {
        "timestamp": {"$gte": START_TS, "$lt": END_TS},
        "keywords_hit": True,
        "rubbish": False,
        "source_category": {"$in": keywords_file_list},
    }
    count = mongo.master_timeline_weibo.find(query_dict).count()
    print "sheqi weibo candidate %s count: " % sort_field, count
    results = mongo.master_timeline_weibo.find(query_dict).sort(sort_field, pymongo.DESCENDING)
    results = [r for r in results]
    for r in results:
        r['title'] = ''
        r['content'] = r['text'].encode('utf-8')
    results = duplicate(results)
    fw = csv.writer(open(result_path + 'sheqi_weibo_sort_%s_%s_%s.csv' % (sort_field, START_DATETIME, END_DATETIME), 'wb'), delimiter='^')
    fw.writerow(WEIBO_KEYS)
    for r in results:
        csvrow = []
        if r['duplicate'] == False:
            for key in WEIBO_KEYS:
                csvrow.append(_encode_utf8(r[key]))
            fw.writerow(csvrow)
def test_regular_input(self):
    self.assertEqual(duplicate([1, 2, 3, 4, 4]), 4)
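# Unlike the text-level duplicate() used elsewhere in this document, the
# duplicate() under test here evidently takes a list of numbers and returns the
# repeated value. A minimal sketch that satisfies the assertion, assuming the
# input contains at most one repeated element:
def duplicate(nums):
    # return the first value seen twice; None if every element is unique
    seen = set()
    for n in nums:
        if n in seen:
            return n
        seen.add(n)
    return None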
def get_adContent(entity_name, score, index_name, type, date, ad123, ad_date):
    t = int(time.mktime(time.strptime(ad_date, '%Y-%m-%d')))
    start_time1 = datetime.strptime(ad_date, "%Y-%m-%d") - timedelta(days=int(date))
    start_time2 = start_time1.strftime("%Y-%m-%d")
    start_time = int(time.mktime(time.strptime(start_time2, '%Y-%m-%d')))
    score = 0  # the score threshold is forced to 0, overriding the argument
    query_body = {
        "size": 500,
        "query": {
            "bool": {
                "must": {"match": {"query_name": entity_name}},
                "should": [],
                "minimum_should_match": 1
            }
        }
    }
    if ad123 == 0:
        query_body['query']['bool']['should'].append({"match": {"ad123": 1}})
        query_body['query']['bool']['should'].append({"match": {"ad123": 2}})
        query_body['query']['bool']['should'].append({"match": {"ad123": 3}})
    elif ad123 == 1:
        query_body['query']['bool']['should'].append({"match": {"ad123": 1}})
    elif ad123 == 2:
        query_body['query']['bool']['should'].append({"match": {"ad123": 2}})
        query_body['query']['bool']['should'].append({"match": {"ad123": 3}})
    if not ad_date == start_time2:
        query_body['query']['bool'].update({
            "filter": [{
                "range": {
                    "publish_time": {"gt": start_time, "lte": t}
                }
            }]
        })
    print(query_body)
    res = es.search(index=index_name, doc_type=type, body=query_body, request_timeout=100)
    hits = res['hits']['hits']
    results = []
    if len(hits):
        for item in hits:
            name = item['_index']
            _id = item['_id']
            if item['_score'] >= score:
                if entity_name in item['_source']['query_name']:
                    result = item['_source']
                    result.update({'source': name})
                    result.update({'_id': _id})
                    results.append(result)
    # sort by publish time
    # results.sort(key=lambda x: x['publish_time'], reverse=True)
    # deduplicate by text similarity
    dup_results = duplicate(results)
    return dup_results
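# A hypothetical call, for illustration only; the entity, index, and dates
# below are placeholders. With ad123=2 the query matches ad levels 2 and 3,
# restricted to the 7 days ending on ad_date.
ads = get_adContent(entity_name='example-brand', score=1.0,
                    index_name='ad_index', type='text',
                    date=7, ad123=2, ad_date='2020-01-08')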
def comments_rubbish_clustering_calculation(comments, cluster_num,
                                            cluster_eva_min_size=CLUSTER_EVA_MIN_SIZE,
                                            version=COMMENT_CLUSTERING_PROCESS_FOR_CLUTO_VERSION):
    """Comment rubbish filtering and clustering.
    input: comments, each containing news_id and news_content
    output:
        cluster_infos: cluster-level info
        item_infos: per-item list with fields clusterid, weight, same_from, duplicate
    """
    # clusterid for meaningless items: ads from ad_filter, rubbish from the svm,
    # and news from the subjective/objective classifier
    NON_CLUSTER_ID = 'nonsense'
    # clusterid for the catch-all class
    OTHER_CLUSTER_ID = 'other'
    # clusterid shown directly
    DIRECT_CLUSTER_ID = 'direct'
    DIRECT_CLUSTER_FEATURE = [u'聚簇']
    # minimum number of inputs for clustering; below this, skip clustering
    MIN_CLUSTERING_INPUT = 20

    # cluster info, mainly the feature words of each cluster
    clusters_infos = {'features': dict()}
    # per-item list; each item carries clusterid, weight, sentiment fields
    items_infos = []

    # field preprocessing
    print('\tData preprocess...')
    inputs = []
    for r in comments:
        r['title'] = ''
        r['content168'] = r['content']  # .encode('utf-8')
        r['content'] = r['content168']
        r['text'] = r['content']
        if 'news_content' in r and r['news_content']:
            r['news_content'] = r['news_content']  # .encode('utf-8')
        else:
            r['news_content'] = ''
        # simple rule-based ad filtering
        item = ad_filter(r)
        if item['ad_label'] == 0:
            inputs.append(item)
        else:
            item['clusterid'] = NON_CLUSTER_ID + '_rub'
            items_infos.append(item)
    print('\tAd filter %d data, data list have: %d' % (len(inputs), len(items_infos)))

    # svm rubbish removal
    print('\tSvm rubbish classify...')
    if len(inputs) == 0:
        items = []
    else:
        items = rubbish_classifier(inputs)
    inputs = []
    for item in items:
        if item['rub_label'] == 1:
            item['clusterid'] = NON_CLUSTER_ID + '_rub'
            items_infos.append(item)
        else:
            inputs.append(item)
    print('\tSvm rubbish classify %d data, data list have: %d' % (len(inputs), len(items_infos)))

    # clustering
    print('\tStart clustering opinion...')
    opinion_name, word_result, text_list, word_main = opinion_main(inputs, cluster_num)
    # if len(inputs) >= 500:
    #     opinion_name, word_result, text_list = opinion_main(inputs, 10)
    # else:
    #     opinion_name, word_result, text_list = opinion_main(inputs, 5)
    print('\tEnd clustering opinion...')
    for k, v in word_result.items():
        # name = opinion_name[k]
        clusters_infos['features'][k] = v
    clusters_infos['word_main'] = word_main

    final_inputs = []
    for k, v in text_list.items():
        for item in v:
            row = copy.deepcopy(item)
            row['clusterid'] = k
            final_inputs.append(row)

    # deduplicate within each sub-opinion class
    cluster_items = dict()
    for r in final_inputs:
        clusterid = r['clusterid']
        try:
            cluster_items[clusterid].append(r)
        except KeyError:
            cluster_items[clusterid] = [r]
    for clusterid, items in cluster_items.items():
        results = duplicate(items)
        items_infos.extend(results)

    return {'cluster_infos': clusters_infos, 'item_infos': items_infos}
def comments_sentiment_rubbish_calculation(comments, logger):
    """Input: a batch of comments with title and content168 fields.
    Output:
        item_infos: per-item list with fields sentiment, same_from, duplicate
    """
    # clusterid for meaningless items: ads from ad_filter, rubbish from the svm,
    # and news from the subjective/objective classifier
    NON_CLUSTER_ID = 'nonsense'
    # clusterid for meaningful items
    MEAN_CLUSTER_ID = 'sentiment'
    # per-item list; each item carries clusterid, weight, sentiment fields
    items_infos = []

    # clear stale fields: sentiment, label, clusterid, ad_label, subob_label, rub_label
    clear_keys = ['sentiment', 'label', 'clusterid', 'ad_label', 'subob_label', 'rub_label', 'weight']
    inputs = []
    for r in comments:
        for key in clear_keys:
            if key in r:
                del r[key]
        inputs.append(r)
    comments = inputs

    # field preprocessing
    inputs = []
    for r in comments:
        r['title'] = ''
        r['content168'] = r['content168'].encode('utf-8')
        r['content'] = r['content168']
        r['text'] = r['content168']
        inputs.append(r)

    # neutral classifier first, then the three-way classifier
    svm_inputs = []
    for r in inputs:
        sentiment = neutral_classifier(r)
        if sentiment != 0:
            sentiment = triple_classifier(r)
            if sentiment == 0:
                svm_inputs.append(r)
            else:
                r['sentiment'] = sentiment
                items_infos.append(r)
        else:
            svm_inputs.append(r)

    # sentiment adjustment
    senti_modify_inputs = []
    for r in svm_inputs:
        sentiment = mid_sentiment_classify(r['text'])
        if sentiment == -1:
            sentiment = 0  # neutral
        if sentiment != 0:
            r['sentiment'] = sentiment
            items_infos.append(r)
        else:
            r['sentiment'] = sentiment
            senti_modify_inputs.append(r)

    # news classification
    inputs = []
    for r in senti_modify_inputs:
        r = subob_classifier(r)
        if r['subob_label'] == 1:
            # subjective/objective classifier: objective items are news
            r['sentiment'] = NON_CLUSTER_ID + '_news'
            items_infos.append(r)
        else:
            inputs.append(r)

    # rubbish removal
    items = rubbish_classifier(inputs)
    for item in items:
        if item['rub_label'] == 1:
            # svm rubbish
            item['sentiment'] = NON_CLUSTER_ID + '_rub'
        else:
            # simple rule-based ad filtering
            item = ad_filter(item)
            if item['ad_label'] == 1:
                item['sentiment'] = NON_CLUSTER_ID + '_rub'
        items_infos.append(item)

    # deduplicate texts within each sentiment class
    sentiment_dict = dict()
    for item in items_infos:
        if 'sentiment' in item:
            sentiment = item['sentiment']
            try:
                sentiment_dict[sentiment].append(item)
            except KeyError:
                sentiment_dict[sentiment] = [item]

    items_infos = []
    for sentiment, items in sentiment_dict.iteritems():
        items_list = duplicate(items)
        items_infos.extend(items_list)

    return {'item_infos': items_infos}
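# Both functions above end with the same pattern: bucket items by a key
# (clusterid or sentiment), then run duplicate() within each bucket so
# near-identical texts are only flagged against texts of the same class.
# A minimal generic sketch of that pattern; the dedup argument defaults to
# the document's own duplicate() helper, and key_field is a hypothetical
# parameter introduced here for illustration.
from collections import defaultdict

def dedup_within_groups(items, key_field, dedup=duplicate):
    # bucket items by key_field, deduplicate each bucket independently,
    # and return the flat list with duplicate/same_from fields filled in
    buckets = defaultdict(list)
    for item in items:
        buckets[item[key_field]].append(item)
    deduped = []
    for _, group in buckets.items():
        deduped.extend(dedup(group))
    return deduped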