def tag_vector(uid, date):
    """Return a one-element list holding the influence tag of ``uid`` for ``date``.

    The user's influence record is read from the ``pre_index + <date without
    dashes>`` index.  The four ``*_detail`` fields of that record are JSON
    objects; their values are summed and compared against
    ``retweeted_threshold`` and ``comment_threshold`` to pick one of
    ``influence_tag['1']`` .. ``influence_tag['4']``.

    :param uid: document id of the user in the influence index.
    :param date: day to inspect; ``str(date)`` with the dashes removed is
        appended to ``pre_index``.
    :return: ``[tag]``.  When the record cannot be read the tag is
        ``influence_tag["0"]``.
    :raises KeyError: if the record lacks one of the ``*_detail`` fields.
    """
    index_name = pre_index + str(date).replace('-', '')
    try:
        bci_result = es_cluster.get(index=index_name, doc_type=influence_doctype, id=uid)["_source"]
    except Exception:
        # No readable influence record for that day: answer with tag "0".
        # (Exception, not a bare except, so Ctrl-C / SystemExit still propagate.)
        return [influence_tag["0"]]

    def _detail_total(field):
        # Each detail field is a JSON object; only the sum of its values matters.
        return sum(json.loads(bci_result[field]).values())

    # NOTE(review): despite its name, sum_retweeted adds up both *origin*
    # details (retweeted + comment) and sum_comment both *retweeted* details.
    # Kept exactly as in the original -- confirm the intended grouping.
    sum_retweeted = (_detail_total("origin_weibo_retweeted_detail")
                     + _detail_total("origin_weibo_comment_detail"))
    sum_comment = (_detail_total("retweeted_weibo_retweeted_detail")
                   + _detail_total("retweeted_weibo_comment_detail"))

    above_retweeted = sum_retweeted >= retweeted_threshold
    above_comment = sum_comment >= comment_threshold
    if above_retweeted:
        tag_key = '3' if above_comment else '1'
    else:
        tag_key = '2' if above_comment else '4'
    return [influence_tag[tag_key]]
def statistics_influence_people(uid, date, style, sensitive=0):
    """Describe the users influenced by ``uid``'s posts on ``date``.

    The function collects the ids of the user's posts of message type 1
    (stored as "origin" mids) and the ``root_mid`` of the user's posts of
    message type 3 (stored as "retweeted" mids) from the flow-text index, then
    delegates to ``influenced_user_detail``.

    :param uid: user id; used as the document id in the influence index and
        as the ``uid`` term filter in the flow-text index.
    :param date: day to inspect.  ``str(date)`` without dashes is appended to
        ``pre_index``; ``date`` itself is appended to ``pre_text_index``.
    :param style: ``int(style) == 0`` asks ``influenced_user_detail`` for
        mode ``3`` (labelled "retweeted" in the original code), anything else
        for mode ``2`` (labelled "comment").
    :param sensitive: when truthy, only posts with ``sensitive > 0`` are
        considered.
    :return: whatever ``influenced_user_detail`` returns, or ``{}`` when the
        user's influence record for that day cannot be read.
    """
    index_name = pre_index + str(date).replace('-', '')
    # Debug output kept from the original; the parenthesised form prints the
    # same text under the Python 2 print statement and the print function.
    print(index_name)
    index_flow_text = pre_text_index + date

    # The record's content is not used; it only gates the rest of the work.
    try:
        es_cluster.get(index=index_name, doc_type=influence_doctype, id=uid)["_source"]
    except Exception:
        return {}

    def _search_hits(message_type):
        # Build a fresh filter per search so the two queries cannot leak terms
        # into each other.  The original added the message_type-3 terms to the
        # already-used first body, leaving the second search unfiltered by uid.
        must = []
        if sensitive:
            must.append({"range": {"sensitive": {"gt": 0}}})
        must.append({"term": {"message_type": message_type}})
        must.append({"term": {"uid": uid}})
        body = {
            "query": {"filtered": {"filter": {"bool": {"must": must}}}},
            "size": 1000,  # request at most 1000 hits per search
        }
        found = es.search(index=index_flow_text, doc_type=flow_text_index_type, body=body)
        return found["hits"]["hits"]

    # origin weibo mid
    origin_mid = [hit['_id'] for hit in _search_hits(1)]
    # retweeted weibo mid: the root post each type-3 post points at, if any
    retweeted_mid = [
        hit['_source']['root_mid']
        for hit in _search_hits(3)
        if hit['_source'].get('root_mid', '')
    ]

    detail_mode = 3 if int(style) == 0 else 2
    return influenced_user_detail(uid, date, origin_mid, retweeted_mid, detail_mode)
def comment_on_influence(uid, date):
    """Build the textual conclusions about ``uid``'s influence on ``date``.

    :param uid: document id of the user in the influence index.
    :param date: day to inspect; ``str(date)`` with the dashes removed is
        appended to ``pre_index``.
    :return: ``[result, underline]``.

        ``result[0]`` is the ``CURRENT_INFLUENCE_CONCLUSION`` entry chosen by
        comparing the record's ``user_index`` with the first five
        ``CURRNET_INFLUENCE_THRESHOULD`` values.  It is followed by two
        entries for each ``i`` in ``range(4)``: the "total" conclusion and
        the "brust" conclusion, each replaced by ``''`` when its threshold is
        not exceeded.  ``underline`` gets one entry per ``i``:
        ``UNDERLINE_CONCLUSION[i]`` when both thresholds are exceeded,
        otherwise ``''``.

        When the influence record cannot be read, the return value is
        ``[[CURRENT_INFLUENCE_CONCLUSION['0']], []]``.
    """
    index_name = pre_index + str(date).replace('-', '')
    try:
        bci_result = es_cluster.get(index=index_name, doc_type=influence_doctype, id=uid)["_source"]
    except Exception:
        # Exception rather than a bare except: interpreter-exit signals still
        # propagate, while a missing or unreadable record yields conclusion '0'.
        return [[CURRENT_INFLUENCE_CONCLUSION['0']], []]

    # The level is the position of the first threshold that user_index stays
    # below; level 5 means it reached all five thresholds.
    user_index = bci_result['user_index']
    level = 5
    for position in range(5):
        if user_index < CURRNET_INFLUENCE_THRESHOULD[position]:
            level = position
            break
    result = [CURRENT_INFLUENCE_CONCLUSION[str(level)]]
    underline = []

    for i in range(4):
        total_exceeded = bci_result[INFLUENCE_TOTAL_LIST[i]] > INFLUENCE_TOTAL_THRESHOULD[i]
        if not total_exceeded:
            # Neither conclusion applies; keep the slots so positions stay aligned.
            result.extend(['', ''])
            underline.append('')
            continue
        result.append(INFLUENCE_TOTAL_CONCLUSION[i])
        # The burst value is only consulted once the total threshold is passed.
        if bci_result[INFLUENCE_BRUST_LIST[i]] > INFLUENCE_BRUST_THRESHOULD[i]:
            result.append(INFLUENCE_BRUST_CONCLUSION[i])
            underline.append(UNDERLINE_CONCLUSION[i])
        else:
            result.append('')
            underline.append('')
    return [result, underline]
def _bci_search_hits(text_index, message_type, uid, sensitive):
    """Return the flow-text hits of one message type posted by ``uid``."""
    query = query_body(message_type, uid)
    if sensitive:
        # list.append() returns None, so the filter is extended in place and
        # the query dict itself is sent as the search body.
        query["query"]["filtered"]["filter"]["bool"]["must"].append(
            {"range": {"sensitive": {"gt": 0}}}
        )
    return es_text.search(index=text_index, doc_type="text", body=query)["hits"]["hits"]


def _bci_fold_hits(hits, sensitive):
    """Fold hits into per-field totals and maxima plus sensitive-word counts."""
    folded = {
        "retweeted_total": 0,
        "comment_total": 0,
        "retweeted_top": 0,
        "comment_top": 0,
        "words": dict(),
    }
    for hit in hits:
        source = hit["_source"]
        retweeted = source.get("retweeted", 0)
        comment = source.get("comment", 0)
        folded["retweeted_total"] += retweeted
        folded["comment_total"] += comment
        if folded["retweeted_top"] < retweeted:
            folded["retweeted_top"] = retweeted
        if folded["comment_top"] < comment:
            folded["comment_top"] = comment
        if sensitive:
            # sensitive_words_dict is stored as a JSON string; presumably it
            # maps word -> count (verify against the indexer).
            words = json.loads(source["sensitive_words_dict"])
            if words:
                for word, count in words.items():
                    if word in folded["words"]:
                        folded["words"][word] += count
                    else:
                        folded["words"][word] = count
    return folded


def _bci_average(total, count):
    """Return ``total / count``, or 0 when ``count`` is 0."""
    # The original ``/`` operator is kept, so the result follows the module's
    # division semantics (floored for ints under plain Python 2).
    return total / count if count else 0


def bci_detail(date, uid, sensitive=0):
    """Summarise how ``uid``'s posts of ``date`` were retweeted and commented.

    Posts are read from the ``"flow_text_" + date`` index: message type 1 is
    reported under the ``origin_*`` keys and message type 3 under the
    ``retweeted_*`` keys.

    NOTE(review): SOURCE contains a second ``bci_detail`` definition with the
    same logic; if both live in one module the later one shadows this one.

    :param date: day string; used verbatim in the text index name and with
        the dashes removed in the ``"bci_"`` index name.
    :param uid: user id.
    :param sensitive: when truthy, only posts with ``sensitive > 0`` are
        counted and sensitive-word statistics are returned; when falsy, the
        burst averages and ``user_index`` of the user's bci record are
        returned instead (0 for any value that is missing).
    :return: dict with post counts plus total / average / top numbers for
        each of origin and retweeted posts, and the mode-specific keys
        described above.
    """
    bci_result = dict()
    if not sensitive:
        bci_index = "bci_" + date.replace("-", "")
        try:
            bci_result = es_bci.get(index=bci_index, doc_type="bci", id=uid)["_source"]
        except Exception:
            # A missing or unreadable record falls back to all-zero defaults.
            bci_result = dict()

    text_index = "flow_text_" + date
    origin_text = _bci_search_hits(text_index, 1, uid, sensitive)
    retweeted_text = _bci_search_hits(text_index, 3, uid, sensitive)
    origin_weibo_number = len(origin_text)
    retweeted_weibo_number = len(retweeted_text)

    origin = _bci_fold_hits(origin_text, sensitive)
    # The original assigned this maximum to a misspelled name
    # (``retweeet_...``), so the reported retweeted top number was always 0.
    retweeted = _bci_fold_hits(retweeted_text, sensitive)

    result = dict()
    result["origin_weibo_number"] = origin_weibo_number
    result["retweeted_weibo_number"] = retweeted_weibo_number
    # total numbers
    result["origin_weibo_retweeted_total_number"] = origin["retweeted_total"]
    result["origin_weibo_comment_total_number"] = origin["comment_total"]
    result["retweeted_weibo_retweeted_total_number"] = retweeted["retweeted_total"]
    result["retweeted_weibo_comment_total_number"] = retweeted["comment_total"]
    # average per post
    result["origin_weibo_retweeted_average_number"] = _bci_average(
        origin["retweeted_total"], origin_weibo_number)
    result["origin_weibo_comment_average_number"] = _bci_average(
        origin["comment_total"], origin_weibo_number)
    result["retweeted_weibo_retweeted_average_number"] = _bci_average(
        retweeted["retweeted_total"], retweeted_weibo_number)
    result["retweeted_weibo_comment_average_number"] = _bci_average(
        retweeted["comment_total"], retweeted_weibo_number)
    # highest single-post numbers
    result["origin_weibo_retweeted_top_number"] = origin["retweeted_top"]
    result["origin_weibo_comment_top_number"] = origin["comment_top"]
    result["retweeted_weibo_retweeted_top_number"] = retweeted["retweeted_top"]
    result["retweeted_weibo_comment_top_number"] = retweeted["comment_top"]

    if not sensitive:
        # burst averages
        for key in (
            "origin_weibo_comment_brust_average",
            "origin_weibo_retweeted_brust_average",
            "retweeted_weibo_comment_brust_average",
            "retweeted_weibo_retweeted_brust_average",
        ):
            result[key] = bci_result.get(key, 0)
        result["user_index"] = bci_result.get("user_index", 0)
    else:
        result["retweeted_sensitive_words_list"] = sorted(
            retweeted["words"].items(), key=lambda x: x[1], reverse=True
        )
        result["origin_sensitive_words_list"] = sorted(
            origin["words"].items(), key=lambda x: x[1], reverse=True
        )
        result["retweeted_sensitive_words_number"] = len(retweeted["words"])
        result["origin_sensitive_words_number"] = len(origin["words"])
    return result
def bci_detail(date, uid, sensitive=0):
    """Summarise how ``uid``'s posts of ``date`` were retweeted and commented.

    For each post kind -- ``origin`` (message type 1) and ``retweeted``
    (message type 3) -- the posts are fetched from the
    ``"flow_text_" + date`` index and reduced to:

    * ``<kind>_weibo_number``: number of posts returned,
    * ``<kind>_weibo_<field>_total_number`` / ``_average_number`` /
      ``_top_number`` for ``field`` in ``retweeted`` and ``comment``.

    :param date: day string; used verbatim in the text index name and with
        the dashes removed in the ``"bci_"`` index name.
    :param uid: user id.
    :param sensitive: when truthy, only posts with ``sensitive > 0`` are
        counted and the result also holds
        ``<kind>_sensitive_words_list`` (``(word, count)`` pairs, highest
        count first) and ``<kind>_sensitive_words_number``.  When falsy, the
        result instead holds the four ``*_brust_average`` values and
        ``user_index`` of the user's bci record (0 when unavailable).
    :return: dict with the keys described above.
    """
    bci_result = dict()
    if not sensitive:
        bci_index = "bci_" + date.replace('-', '')
        try:
            bci_result = es_bci.get(index=bci_index, doc_type="bci", id=uid)['_source']
        except Exception:
            # A missing or unreadable record falls back to all-zero defaults.
            bci_result = dict()

    text_index = "flow_text_" + date
    result = dict()
    for kind, message_type in (("origin", 1), ("retweeted", 3)):
        query = query_body(message_type, uid)
        if sensitive:
            # Extend the filter in place: list.append() returns None, which
            # the original then passed as the search body.
            query["query"]["filtered"]["filter"]["bool"]["must"].append(
                {"range": {"sensitive": {"gt": 0}}})
        found = es_text.search(index=text_index, doc_type="text", body=query)
        sources = [hit['_source'] for hit in found["hits"]["hits"]]

        weibo_number = len(sources)
        result[kind + "_weibo_number"] = weibo_number
        for field in ("retweeted", "comment"):
            counts = [source.get(field, 0) for source in sources]
            total = sum(counts)
            stem = kind + "_weibo_" + field
            result[stem + "_total_number"] = total
            # Same ``/`` operator as the original; 0 when there are no posts.
            result[stem + "_average_number"] = total / weibo_number if weibo_number else 0
            # The maximum starts from 0, as in the original.  The original
            # assigned the retweeted/retweeted maximum to a misspelled name
            # (``retweeet_...``), so it always reported 0.
            result[stem + "_top_number"] = max([0] + counts)

        if sensitive:
            word_counts = dict()
            for source in sources:
                # sensitive_words_dict is stored as a JSON string; presumably
                # it maps word -> count (verify against the indexer).
                words = json.loads(source['sensitive_words_dict'])
                if not words:
                    continue
                for word, count in words.items():
                    if word in word_counts:
                        word_counts[word] += count
                    else:
                        word_counts[word] = count
            result[kind + "_sensitive_words_list"] = sorted(
                word_counts.items(), key=lambda x: x[1], reverse=True)
            result[kind + "_sensitive_words_number"] = len(word_counts)

    if not sensitive:
        # burst averages
        for kind in ("origin", "retweeted"):
            for field in ("comment", "retweeted"):
                key = kind + "_weibo_" + field + "_brust_average"
                result[key] = bci_result.get(key, 0)
        result['user_index'] = bci_result.get('user_index', 0)
    return result