def Facebooks_mappings():
    import time
    db = MySQLdb.connect(host="localhost", user="******", passwd="", db="db_F", charset='utf8')
    cursor = db.cursor()
    cursor.execute("SELECT * from FIDscouting")
    rows = cursor.fetchall()
    bulk_action_all = {}   # per-date bulk bodies, keyed by publish date
    count_dict = {}        # per-date document counts
    count_i = 0
    for row in rows:
        action = {'index': {'_id': row[2].split('_')[1]}}
        source_item = {}
        source_item['uid'] = str(row[1])
        source_item['fid'] = row[2].split('_')[1]
        source_item['text'] = row[3].decode("utf-8", "replace")
        source_item['timestamp'] = int(time.mktime(row[4].timetuple()))
        source_item['update_time'] = int(time.mktime(row[5].timetuple()))
        date_time = ts2datetime(int(time.mktime(row[4].timetuple())))
        try:
            bulk_action_all[date_time].extend([action, source_item])
            count_dict[date_time] += 1
        except KeyError:
            bulk_action_all[date_time] = [action, source_item]
            count_dict[date_time] = 1
        count_i += 1
        # flush any date whose pending batch has reached a multiple of 1000
        for date, count in count_dict.iteritems():
            if count % 1000 == 0:
                index_name = facebook_flow_text_index_name_pre + date
                if not es.indices.exists(index=index_name):
                    facebook_flow_text_mappings(index_name)
                if bulk_action_all[date]:
                    es.bulk(bulk_action_all[date], index=index_name,
                            doc_type=facebook_flow_text_index_type, timeout=600)
                    bulk_action_all[date] = []
        if count_i % 1000 == 0:
            print count_i
    # flush whatever is left for each date
    for date, bulk_action in bulk_action_all.iteritems():
        if bulk_action:
            index_name = facebook_flow_text_index_name_pre + date
            if not es.indices.exists(index=index_name):
                facebook_flow_text_mappings(index_name)
            es.bulk(bulk_action, index=index_name,
                    doc_type=facebook_flow_text_index_type, timeout=600)
def match_flow_text(current_date):
    '''
    # The mapping is wrong; skip creating the index and storing data for now, 2018-4-13 17:11:51
    new_xnr_flow_text_index_name = new_xnr_flow_text_index_name_pre + current_date
    new_weibo_xnr_flow_text_mappings(new_xnr_flow_text_index_name)
    '''
    flow_text_index_name = new_fb_xnr_flow_text_index_name_pre + current_date
    query_body = {'query': {'match_all': {}}, 'size': MAX_VALUE}
    try:
        search_results = es_xnr.search(index=fb_xnr_index_name, doc_type=fb_xnr_index_type,
                                       body=query_body)['hits']['hits']
        bulk_action = []
        uid_list = []
        xnr_user_no_list = []
        count = 0
        for result in search_results:
            result = result['_source']
            if result.has_key('uid'):
                uid_list.append(result['uid'])
                xnr_user_no_list.append(result['xnr_user_no'])
        # filter_keywords = {uid1: {mid1: {k: v, ...}, ...}, ...}
        filter_keywords = get_filter_keywords_for_match_function([flow_text_index_name], uid_list)
        for uid, content in filter_keywords.items():
            mid_list = []
            mid_weibo = {}  # {mid1: {'key1': f1, 'key2': f2, ...}, ...}
            for mid, keywords_dict in content.items():
                for k, v in keywords_dict.items():
                    keywords_dict[k.encode('utf-8', 'ignore')] = v
                mid_list.append(mid)
                mid_weibo[mid] = keywords_dict
            # mid_topic_dict {mid1: {'art': 0.1, 'social': 0.2, ...}, ...}
            # mid_topic_list {mid1: ['art', 'social', 'media'], ...}
            mid_topic_dict, mid_topic_list = topic_classfiy(mid_list, mid_weibo)
            for mid, topic_dict in mid_topic_dict.items():
                match_item = {
                    'topic_field_first': topic_en2ch_dict[mid_topic_list[mid][0]],
                    'topic_field': '&'.join(mid_topic_list[mid]),
                    'xnr_user_no': xnr_user_no_list[uid_list.index(uid)],
                }
                action = {'update': {'_id': mid}}
                bulk_action.extend([action, {'doc': match_item}])
        if bulk_action:
            es_xnr.bulk(bulk_action, index=flow_text_index_name,
                        doc_type=new_fb_xnr_flow_text_index_type, timeout=600)
    except Exception, e:
        # print e
        return 'no tweets to update today'
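# Note on the classifier hand-off used above (illustrative, inferred from the inline
# comments and the call sites in this module, not from topic_classfiy itself):
# topic_classfiy is fed {mid: {keyword: freq}} and, as called here, returns a per-mid
# score dict plus a per-mid ordered topic list, e.g.
#   mid_topic_dict -> {'mid1': {'art': 0.1, 'social': 0.2, ...}, ...}
#   mid_topic_list -> {'mid1': ['art', 'social', 'media'], ...}
# topic_en2ch_dict then maps the leading English label to its Chinese display name.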
def create_active_user():
    now_time = int(time.time()) - DAY
    result_list = lookup_active_weibouser(now_time)
    weibo_active_user_index_name = weibo_active_user_index_name_pre + ts2datetime(now_time)
    count = 0
    bulk_create_action = []
    if result_list:
        for item in result_list:
            create_action = {'index': {'_id': item['uid']}}
            bulk_create_action.extend([create_action, item])
            count += 1
            if count % 99 == 0:
                es_xnr.bulk(bulk_create_action, index=weibo_active_user_index_name,
                            doc_type=weibo_active_user_index_type)
                bulk_create_action = []
    if bulk_create_action:
        result = es_xnr.bulk(bulk_create_action, index=weibo_active_user_index_name,
                             doc_type=weibo_active_user_index_type)
        if result['errors']:
            print result
            return False
    return True
def sensitive_func(index_name, ts):
    bulk_action = []
    query_body = {
        'query': {'match_all': {}},
        'size': 999,
    }
    res = es.search(index=index_name, doc_type='text', body=query_body)['hits']['hits']
    for r in res:
        _id = r['_id']
        uid = r['_source']['uid']
        mid = ''
        if r['_source'].has_key('mid'):
            mid = r['_source']['mid']
        text = ''
        if r['_source'].has_key('text'):
            text = r['_source']['text']
        sensitive_info = get_sensitive_info(ts, mid, text)
        sensitive_user = get_sensitive_user(ts, uid)
        item = {
            'sensitive_info': sensitive_info,
            'sensitive_user': sensitive_user,
        }
        action = {'update': {'_id': _id}}
        bulk_action.extend([action, {'doc': item}])
    if bulk_action:
        print es.bulk(bulk_action, index=index_name, doc_type='text', timeout=600)
def save_data2es(data):
    update_uid_list = []
    create_uid_list = []
    try:
        for uid, d in data.items():
            if es.exists(index=fb_portrait_index_name, doc_type=fb_portrait_index_type, id=uid):
                update_uid_list.append(uid)
            else:
                create_uid_list.append(uid)
        # bulk create
        bulk_create_action = []
        if create_uid_list:
            for uid in create_uid_list:
                create_action = {'index': {'_id': uid}}
                bulk_create_action.extend([create_action, data[uid]])
            result = es.bulk(bulk_create_action, index=fb_portrait_index_name,
                             doc_type=fb_portrait_index_type)
            if result['errors']:
                print result
                return False
        # bulk update
        if update_uid_list:
            bulk_update_action = []
            for uid in update_uid_list:
                update_action = {'update': {'_id': uid}}
                bulk_update_action.extend([update_action, {'doc': data[uid]}])
            result = es.bulk(bulk_update_action, index=fb_portrait_index_name,
                             doc_type=fb_portrait_index_type)
            if result['errors']:
                print result
                return False
    except Exception, e:
        print e
        return False
    return True
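# Illustrative usage sketch for save_data2es (not part of the original module): it takes
# a dict keyed by uid whose values are the portrait documents to index; existing uids are
# bulk-updated, new uids are bulk-created. The uids and field names below are placeholders,
# not the real portrait schema.
def _example_save_data2es():
    data = {
        '1000000000001': {'uid': '1000000000001', 'nick_name': 'example_a'},
        '1000000000002': {'uid': '1000000000002', 'nick_name': 'example_b'},
    }
    return save_data2es(data)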
def Facebook_count_mappings():
    import time
    db = MySQLdb.connect(host="localhost", user="******", passwd="", db="db_F", charset='utf8')
    cursor = db.cursor()
    cursor.execute("SELECT * from Fscouting")
    rows = cursor.fetchall()
    bulk_action_all = {}   # per-date bulk bodies, keyed by update date
    count_dict = {}        # per-date document counts
    count_i = 0
    for row in rows:
        timestamp = int(time.mktime(row[5].timetuple()))
        _id = row[1] + '_' + str(timestamp)
        action = {'index': {'_id': _id}}
        source_item = {}
        source_item['uid'] = row[1].split('_')[0]
        source_item['fid'] = row[1].split('_')[1]
        source_item['share'] = row[2]
        source_item['comment'] = row[3]
        source_item['favorite'] = row[4]
        source_item['update_time'] = timestamp
        date_time = ts2datetime(timestamp)
        try:
            bulk_action_all[date_time].extend([action, source_item])
            count_dict[date_time] += 1
        except KeyError:
            bulk_action_all[date_time] = [action, source_item]
            count_dict[date_time] = 1
        count_i += 1
        # flush any date whose pending batch has reached a multiple of 1000
        for date, count in count_dict.iteritems():
            if count % 1000 == 0:
                index_name = facebook_count_index_name_pre + date
                #if not es.indices.exists(index=index_name): facebook_count_mappings(index_name)
                if bulk_action_all[date]:
                    es.bulk(bulk_action_all[date], index=index_name,
                            doc_type=facebook_flow_text_index_type, timeout=600)
                    bulk_action_all[date] = []
        if count_i % 1000 == 0:
            print count_i
    # flush whatever is left for each date
    for date, bulk_action in bulk_action_all.iteritems():
        if bulk_action:
            index_name = facebook_count_index_name_pre + date
            #if not es.indices.exists(index=index_name): facebook_count_mappings(index_name)
            es.bulk(bulk_action, index=index_name,
                    doc_type=facebook_flow_text_index_type, timeout=600)
def update_hidden_expression():
    res = []
    with open('hidden_expression.json') as f:
        res = json.loads(f.read())
    index = 0
    bulk_action = []
    for cdr in res:
        index += 1
        action = {"index": {"_id": cdr['_id']}}
        bulk_action.extend([action, cdr['_source']])
        if index % 1000 == 0:
            es.bulk(bulk_action, index=weibo_hidden_expression_index_name,
                    doc_type=weibo_hidden_expression_index_type)
            bulk_action = []
    if bulk_action:
        es.bulk(bulk_action, index=weibo_hidden_expression_index_name,
                doc_type=weibo_hidden_expression_index_type)
    print 'finish insert'
def update_groupmessage():
    res = []
    with open('group_message_2018-03-07.json') as f:
        res = json.loads(f.read())
    index = 0
    bulk_action = []
    for cdr in res:
        index += 1
        action = {"index": {"_id": cdr['_id']}}
        bulk_action.extend([action, cdr['_source']])
        if index % 100 == 0:
            es.bulk(bulk_action, index='group_message_2018-03-07', doc_type='record')
            bulk_action = []
    if bulk_action:
        es.bulk(bulk_action, index='group_message_2018-03-07', doc_type='record')
    print 'finish insert'
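# update_hidden_expression and update_groupmessage follow the same pattern: read an es
# dump (a JSON list of {"_id": ..., "_source": ...} hits), pair an {"index": {"_id": ...}}
# action with each source, and flush in fixed-size batches. A minimal generic sketch of
# that pattern, assuming the same dump format (the function name and parameters here are
# illustrative, not part of the original code):
def bulk_load_dump(file_name, index_name, doc_type, batch_size=1000):
    with open(file_name) as f:
        docs = json.loads(f.read())
    bulk_action = []
    for i, cdr in enumerate(docs, 1):
        bulk_action.extend([{'index': {'_id': cdr['_id']}}, cdr['_source']])
        if i % batch_size == 0:
            es.bulk(bulk_action, index=index_name, doc_type=doc_type)
            bulk_action = []
    if bulk_action:
        es.bulk(bulk_action, index=index_name, doc_type=doc_type)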
def social_sensing(): all_fid_list, end_ts = count_statis() if S_TYPE == 'test': all_fid_list = ALL_FID_LIST index_list = [] for i in range(7): timestamp = end_ts - i * DAY flow_text_index_name = flow_text_index_name_pre + ts2datetime( timestamp) index_list.append(flow_text_index_name) #index_list = [flow_text_index_name_pre+date_1,flow_text_index_name_pre+date_2] print 'index_list...', index_list # 感知到的事, all_fid_list sensitive_text_list = [] tmp_sensitive_warning = "" text_dict = dict() # 文本信息 fid_value = dict() # 文本赋值 duplicate_dict = dict() # 重合字典 portrait_dict = dict() # 背景信息 classify_text_dict = dict() # 分类文本 classify_uid_list = [] classify_fid_list = [] duplicate_text_list = [] sensitive_words_dict = dict() sensitive_weibo_detail = {} all_text_dict = dict() fid_ts_dict = dict() # 文本发布时间 # 有事件发生时开始 #if 1: if index_list and all_fid_list: query_body = { "query": { "filtered": { "filter": { "terms": { "fid": all_fid_list } } } }, "size": 5000 } search_results = es.search(index=index_list, doc_type="text", body=query_body)['hits']['hits'] print "search fid len: ", len(search_results) if search_results: for item in search_results: iter_uid = item['_source']['uid'] iter_fid = item['_source']['fid'] fid_ts_dict[iter_fid] = item["_source"]["timestamp"] iter_text = item['_source']['text'].encode('utf-8', 'ignore') iter_sensitive = item['_source'].get('sensitive', 0) tmp_text = get_weibo(item['_source']) all_text_dict[iter_fid] = tmp_text duplicate_text_list.append({ "_id": iter_fid, "title": "", "content": iter_text.decode("utf-8", 'ignore') }) if iter_sensitive: tmp_sensitive_warning = signal_sensitive_variation #涉及到敏感词的微博 sensitive_words_dict[iter_fid] = iter_sensitive keywords_dict = json.loads(item['_source']['keywords_dict']) personal_keywords_dict = dict() for k, v in keywords_dict.iteritems(): k = k.encode('utf-8', 'ignore') personal_keywords_dict[k] = v classify_text_dict[iter_fid] = personal_keywords_dict #classify_uid_list.append(iter_uid) classify_fid_list.append(iter_fid) # 去重 print "start duplicate" if duplicate_text_list: dup_results = duplicate(duplicate_text_list) for item in dup_results: if item['duplicate']: duplicate_dict[item['_id']] = item['same_from'] # 分类 print "start classify" fid_value = dict() if classify_text_dict: #classify_results = topic_classfiy(classify_uid_list, classify_text_dict) classify_results = topic_classfiy(classify_fid_list, classify_text_dict) #print "classify_results: ", classify_results for k, v in classify_results.iteritems(): # fid:value #fid_value[k] = topic_value_dict[v[0]] fid_value[k] = v[0] # organize data fid_list = all_text_dict.keys() print "final fid:", len(fid_list) print "intersection: ", len(set(fid_list) & set(all_fid_list)) bulk_action = [] count = 0 #social_sensing_index_name = "fb_social_sensing_text_" + ts2datetime(end_ts) social_sensing_index_name = "fb_social_sensing_text" mappings_social_sensing_text(social_sensing_index_name) for fid in fid_list: iter_dict = dict() if duplicate_dict.has_key(fid): iter_dict["duplicate"] = duplicate_dict[fid] else: iter_dict["duplicate"] = "" iter_dict["compute_status"] = 0 # 尚未计算 iter_dict["topic_field"] = fid_value[fid] iter_dict["detect_ts"] = end_ts #iter_dict["xnr_user_no"] = xnr_user_no iter_dict.update(all_text_dict[fid]) count += 1 print 'iter_dict:::', iter_dict # _id = xnr_user_no + '_' + fid bulk_action.extend([{"index": {"_id": fid}}, iter_dict]) if count % 500 == 0: es.bulk(bulk_action, index=social_sensing_index_name, doc_type="text", timeout=600) bulk_action = [] if bulk_action: 
es.bulk(bulk_action, index=social_sensing_index_name, doc_type="text", timeout=600) return "1"
def tw_flow_text():
    start_ts = datetime2ts('2017-10-10')
    end_ts = datetime2ts('2017-10-25')
    day_num = (end_ts - start_ts) / (24 * 3600) + 1
    count = 0
    for i in range(day_num):
        timestamp = start_ts + i * 24 * 3600
        date = ts2datetime(timestamp)
        index_name = twitter_flow_text_index_name_pre + date
        query_body = {'query': {'match_all': {}}}
        scan_results = scan(es, index=index_name, doc_type=twitter_flow_text_index_type,
                            query=query_body, size=1000)
        bulk_action = []
        while 1:
            try:
                data = scan_results.next()
                item = data['_source']
                # initialise the count fields for every tweet of the day
                body_dict = {}
                body_dict['comment'] = 0
                body_dict['favorite'] = 0
                body_dict['share'] = 0
                body_dict['update_time'] = item['timestamp']
                _id = item['tid']
                action = {'update': {'_id': _id}}
                bulk_action.extend([action, {'doc': body_dict}])
                count += 1
                if count % 100 == 0:
                    print 'tw..', count
                    es.bulk(bulk_action, index=index_name,
                            doc_type=twitter_flow_text_index_type, timeout=100)
                    bulk_action = []
            except StopIteration:
                break
        if bulk_action:
            es.bulk(bulk_action, index=index_name,
                    doc_type=twitter_flow_text_index_type, timeout=100)
def match_flow_text():
    current_time = int(time.time())
    current_date = ts2datetime(current_time)
    new_xnr_flow_text_index_name = new_xnr_flow_text_index_name_pre + current_date
    new_weibo_xnr_flow_text_mappings(new_xnr_flow_text_index_name)
    xnr_flow_text_index_name = xnr_flow_text_index_name_pre + current_date
    flow_text_index_name = flow_text_index_name_pre + current_date
    query_body = {'query': {'match_all': {}}, 'size': MAX_VALUE}
    try:
        search_results = es_xnr.search(index=weibo_xnr_index_name, doc_type=weibo_xnr_index_type,
                                       body=query_body)['hits']['hits']
        bulk_action = []
        count = 0
        for result in search_results:
            result = result['_source']
            uid = result['uid']
            xnr_user_no = result['xnr_user_no']
            match_query_body = {
                'query': {
                    'bool': {
                        'must': [
                            {'term': {'uid': uid}}
                        ]
                    }
                },
                'size': MAX_VALUE
            }
            match_results = es_flow_text.search(index=flow_text_index_name,
                                                doc_type=flow_text_index_type,
                                                body=match_query_body)['hits']['hits']
            for match_item in match_results:
                match_item = match_item['_source']
                keyword_dict = match_item['keywords_dict']
                mid = match_item['mid']
                keywords_dict = json.loads(keyword_dict)
                personal_keywords_dict = dict()
                classify_text_dict = dict()  # text to classify
                mid_value = dict()
                for k, v in keywords_dict.iteritems():
                    k = k.encode('utf-8', 'ignore')
                    personal_keywords_dict[k] = v
                classify_text_dict[mid] = personal_keywords_dict
                if classify_text_dict:
                    classify_results = topic_classfiy([mid], classify_text_dict)
                    for k, v in classify_results.iteritems():  # mid: topic list
                        mid_value[k] = v
                    match_item['topic_field_first'] = topic_en2ch_dict[mid_value[mid][0]]
                    match_item['topic_field'] = '&'.join(mid_value[mid])
                match_item['xnr_user_no'] = xnr_user_no
                action = {'index': {'_id': mid}}
                source = match_item
                bulk_action.extend([action, source])
                count += 1
                if count % 1000 == 0:
                    es_xnr.bulk(bulk_action, index=xnr_flow_text_index_name,
                                doc_type=xnr_flow_text_index_type, timeout=600)
                    bulk_action = []
        if bulk_action:
            es_xnr.bulk(bulk_action, index=xnr_flow_text_index_name,
                        doc_type=xnr_flow_text_index_type, timeout=600)
    except:
        return 'no tweets to update today'
def save_user_results(bulk_action):
    print es_xnr.bulk(bulk_action, index=qa_corpus_index_name,
                      doc_type=qa_corpus_index_type, timeout=600)
    return 'True'
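# Illustrative only: save_user_results expects a ready-made bulk body, i.e. a flat list
# of alternating action and source entries, for example
#   [{'index': {'_id': 'qa_1'}}, {'question': u'...', 'answer': u'...'}]
# (the _id and field names above are placeholders, not the real qa_corpus schema).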
def social_sensing(task_detail): ''' with open("prediction_uid.pkl", "r") as f: uid_model = pickle.load(f) with open("prediction_weibo.pkl", "r") as f: weibo_model = pickle.load(f) ''' # 任务名 传感器 终止时间 之前状态 创建者 时间 task_name = task_detail[0] social_sensors = task_detail[1] #ts = int(task_detail[2]) ts = float(task_detail[2]) #xnr_user_no = task_detail[3] print ts2date(ts) index_list = [] important_words = [] datetime_1 = ts2datetime(ts) index_name_1 = flow_text_index_name_pre + datetime_1 exist_es = es_text.indices.exists(index=index_name_1) if exist_es: index_list.append(index_name_1) datetime_2 = ts2datetime(ts-DAY) index_name_2 = flow_text_index_name_pre + datetime_2 exist_es = es_text.indices.exists(index=index_name_2) if exist_es: index_list.append(index_name_2) if es_text.indices.exists(index=flow_text_index_name_pre+ts2datetime(ts-2*DAY)): index_list.append(flow_text_index_name_pre+ts2datetime(ts-2*DAY)) # PART 1 #forward_result = get_forward_numerical_info(task_name, ts, create_by) # 之前时间阶段内的原创微博list/retweeted forward_origin_weibo_list, forward_1 = query_mid_list(ts-time_interval, social_sensors, forward_time_range) forward_retweeted_weibo_list, forward_3 = query_mid_list(ts-time_interval, social_sensors, forward_time_range, 3) # 当前阶段内原创微博list current_mid_list, current_1 = query_mid_list(ts, social_sensors, time_interval) current_retweeted_mid_list, current_3 = query_mid_list(ts, social_sensors, time_interval, 3) all_mid_list = [] all_mid_list.extend(current_mid_list) all_mid_list.extend(current_retweeted_mid_list) all_mid_list.extend(forward_origin_weibo_list) all_mid_list.extend(forward_retweeted_weibo_list) all_origin_list = [] all_origin_list.extend(current_mid_list) all_origin_list.extend(forward_origin_weibo_list) all_origin_list = list(set(all_origin_list)) all_retweeted_list = [] all_retweeted_list.extend(current_retweeted_mid_list) all_retweeted_list.extend(forward_retweeted_weibo_list)#被转发微博的mid/root-mid all_retweeted_list = list(set(all_retweeted_list)) all_mid_list = filter_mid(all_mid_list) all_origin_list = filter_mid(all_origin_list) all_retweeted_list = filter_mid(all_retweeted_list) print "all mid list: ", len(all_mid_list) print "all_origin_list", len(all_origin_list) print "all_retweeted_list", len(all_retweeted_list) # 查询微博在当前时间内的转发和评论数, 聚合按照message_type #statistics_count = query_related_weibo(ts, all_mid_list, time_interval) if all_origin_list: #origin_weibo_detail = query_hot_weibo(ts, all_origin_list, time_interval) # 原创微博详情 origin_weibo_detail = dict() for mid in all_origin_list: retweet_count = es_text.count(index=index_list, doc_type="text", body={"query":{"bool":{"must":[{"term":{"root_mid": mid}}, {"term":{"message_type":3}}]}}})["count"] comment_count = es_text.count(index=index_list, doc_type="text", body={"query":{"bool":{"must":[{"term":{"root_mid": mid}}, {"term":{"message_type":2}}]}}})["count"] tmp = dict() tmp["retweeted"] = retweet_count tmp["comment"] = comment_count origin_weibo_detail[mid] = tmp else: origin_weibo_detail = {} print "len(origin_weibo_detail): ", len(origin_weibo_detail) if all_retweeted_list: retweeted_weibo_detail = dict() for mid in all_retweeted_list: retweet_count = es_text.count(index=index_list, doc_type="text", body={"query":{"bool":{"must":[{"term":{"root_mid": mid}}, {"term":{"message_type":3}}]}}})["count"] comment_count = es_text.count(index=index_list, doc_type="text", body={"query":{"bool":{"must":[{"term":{"root_mid": mid}}, {"term":{"message_type":2}}]}}})["count"] tmp = dict() tmp["retweeted"] = retweet_count 
tmp["comment"] = comment_count retweeted_weibo_detail[mid] = tmp #retweeted_weibo_detail = query_hot_weibo(ts, all_retweeted_list, time_interval) # 转发微博详情 else: retweeted_weibo_detail = {} print "len(retweeted_weibo_detail): ", len(retweeted_weibo_detail) #current_total_count = statistics_count['total_count'] # 当前阶段内所有微博总数 #current_retweeted_count = statistics_count['retweeted'] #current_comment_count = statistics_count['comment'] #all_mid_list = list(set(all_origin_list[:100]) | set(all_retweeted_list[:100])) # 感知到的事, all_mid_list sensitive_text_list = [] tmp_sensitive_warning = "" text_dict = dict() # 文本信息 mid_value = dict() # 文本赋值 duplicate_dict = dict() # 重合字典 portrait_dict = dict() # 背景信息 classify_text_dict = dict() # 分类文本 classify_uid_list = [] duplicate_text_list = [] sensitive_words_dict = dict() sensitive_weibo_detail = {} trendline_dict = dict() all_text_dict = dict() # 有事件发生时开始 if 1: print "index_list:", index_list if index_list and all_mid_list: query_body = { "query":{ "filtered":{ "filter":{ "terms":{"mid": all_mid_list} } } }, "size": 5000 } search_results = es_text.search(index=index_list, doc_type="text", body=query_body)['hits']['hits'] print "search mid len: ", len(search_results) tmp_sensitive_warning = "" text_dict = dict() # 文本信息 mid_value = dict() # 文本赋值 duplicate_dict = dict() # 重合字典 portrait_dict = dict() # 背景信息 classify_text_dict = dict() # 分类文本 #classify_uid_list = [] classify_mid_list = [] duplicate_text_list = [] sensitive_words_dict = dict() mid_ts_dict = dict() # 文本发布时间 uid_prediction_dict = dict() weibo_prediction_dict = dict() trendline_dict = dict() feature_prediction_list = [] # feature mid_prediction_list = [] # dui ying mid if search_results: for item in search_results: iter_uid = item['_source']['uid'] iter_mid = item['_source']['mid'] mid_ts_dict[iter_mid] = item["_source"]["timestamp"] iter_text = item['_source']['text'].encode('utf-8', 'ignore') iter_sensitive = item['_source'].get('sensitive', 0) tmp_text = get_weibo(item['_source']) all_text_dict[iter_mid] = tmp_text duplicate_text_list.append({"_id":iter_mid, "title": "", "content":iter_text.decode("utf-8",'ignore')}) if iter_sensitive: tmp_sensitive_warning = signal_sensitive_variation #涉及到敏感词的微博 sensitive_words_dict[iter_mid] = iter_sensitive keywords_dict = json.loads(item['_source']['keywords_dict']) personal_keywords_dict = dict() for k, v in keywords_dict.iteritems(): k = k.encode('utf-8', 'ignore') personal_keywords_dict[k] = v classify_text_dict[iter_mid] = personal_keywords_dict #classify_uid_list.append(iter_uid) classify_mid_list.append(iter_mid) # 去重 print "start duplicate" if duplicate_text_list: dup_results = duplicate(duplicate_text_list) for item in dup_results: if item['duplicate']: duplicate_dict[item['_id']] = item['same_from'] # 分类 print "start classify" mid_value = dict() if classify_text_dict: #classify_results = topic_classfiy(classify_uid_list, classify_text_dict) classify_results = topic_classfiy(classify_mid_list, classify_text_dict) #print "classify_results: ", classify_results for k,v in classify_results.iteritems(): # mid:value #mid_value[k] = topic_value_dict[v[0]] mid_value[k]=v[0] #feature_list = organize_feature(k, mid_ts_dict[k]) #feature_prediction_list.append(feature_list) # feature list #mid_prediction_list.append(k) # corresponding # prediction """ print "start prediction" weibo_prediction_result = weibo_model.predict(feature_prediction_list) uid_prediction_result = uid_model.predict(feature_prediction_list) for i in range(len(mid_prediction_list)): if i % 
100 == 0: print i uid_prediction_dict[mid_prediction_list[i]] = uid_prediction_result[i] weibo_prediction_dict[mid_prediction_list[i]] = weibo_prediction_result[i] tmp_trendline = trendline_list(mid_prediction_list[i], weibo_prediction_result[i], mid_ts_dict[mid_prediction_list[i]]) trendline_dict[mid_prediction_list[i]] = tmp_trendline """ # organize data mid_list = all_text_dict.keys() print "final mid:", len(mid_list) print "intersection: ", len(set(mid_list)&set(all_mid_list)) bulk_action = [] count = 0 for mid in mid_list: iter_dict = dict() if origin_weibo_detail.has_key(mid): iter_dict.update(origin_weibo_detail[mid]) iter_dict["type"] = 1 elif retweeted_weibo_detail.has_key(mid): iter_dict.update(retweeted_weibo_detail[mid]) iter_dict["type"] = 3 else: iter_dict["retweeted"] = 0 iter_dict["comment"] = 0 print "mid in all_mid_list: ", mid in set(all_mid_list) #iter_dict["trendline"] = json.dumps(trendline_dict[mid]) if duplicate_dict.has_key(mid): iter_dict["duplicate"] = duplicate_dict[mid] else: iter_dict["duplicate"] = "" #iter_dict["uid_prediction"] = uid_prediction_dict[mid] #iter_dict["weibo_prediction"] = weibo_prediction_dict[mid] iter_dict["compute_status"] = 0 # 尚未计算 iter_dict["topic_field"] = mid_value[mid] iter_dict["detect_ts"] = ts #iter_dict["xnr_user_no"] = xnr_user_no iter_dict.update(all_text_dict[mid]) count += 1 #print 'iter_dict:::',iter_dict # _id = xnr_user_no + '_' + mid _id = mid bulk_action.extend([{"index":{"_id": _id}}, iter_dict]) if count % 500 == 0: es_xnr.bulk(bulk_action, index="social_sensing_text", doc_type="text", timeout=600) bulk_action = [] if bulk_action: es_xnr.bulk(bulk_action, index="social_sensing_text", doc_type="text", timeout=600) return "1"
def scan_retweet(ft_type):
    count = 0
    scan_cursor = 0
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    # get redis db number
    db_number = get_db_num(now_date_ts)
    if ft_type == 'fb':
        retweet_redis_dict = fb_retweet_dict
        index_name = 'fb_be_retweet_' + str(db_number)
    else:
        retweet_redis_dict = tw_retweet_dict
        index_name = 'tw_be_retweet_' + str(db_number)
    # get redis db
    retweet_redis = retweet_redis_dict[str(db_number)]
    # recreate the es index before refilling it
    be_retweet_es_mappings(str(db_number), ft_type)
    # scan the redis keys and bulk-index the "be retweeted" relations
    be_retweet_bulk_action = []
    start_ts = time.time()
    be_retweet_count = 0
    while True:
        re_scan = retweet_redis.scan(scan_cursor, count=100)
        scan_cursor = re_scan[0]
        for item in re_scan[1]:
            count += 1
            item_list = item.split('_')
            save_dict = {}
            if len(item_list) == 3:
                be_retweet_count += 1
                uid = item_list[2]
                item_result = retweet_redis.hgetall(item)
                save_dict['uid'] = uid
                save_dict['uid_be_retweet'] = json.dumps(item_result)
                be_retweet_bulk_action.extend([{'index': {'_id': uid}}, save_dict])
        if be_retweet_bulk_action:
            es.bulk(be_retweet_bulk_action, index=index_name, doc_type='user')
            be_retweet_bulk_action = []
        if scan_cursor == 0:
            # a cursor of 0 means the SCAN iteration is complete
            print 'scan finish'
            break
    end_ts = time.time()
    print '%s sec scan %s count user' % (end_ts - start_ts, count)
    # flushdb
    retweet_redis.flushdb()
    print 'end'
def input_hashtag(date_time, type_ft):
    query_body = {'query': {'match_all': {}}}
    if type_ft == 'fb':
        index_name_pre = facebook_flow_text_index_name_pre
    else:
        index_name_pre = twitter_flow_text_index_name_pre
    index_name = index_name_pre + date_time
    es_scan = scan(es, index=index_name, doc_type=flow_text_index_type,
                   query=query_body, size=1000)
    # capture "#tag" followed by a delimiter (space / punctuation); CJK is allowed inside the tag
    RE = re.compile(
        u'#([0-9a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+)[ ," =.。: :、]'
    )
    bulk_action = []
    count = 0
    while 1:
        try:
            data = es_scan.next()
            _id = data['_id']
            item = data['_source']
            text = item['text']
            if isinstance(text, str):
                text = text.decode('utf-8', 'ignore')
            hashtag_list = RE.findall(text)
            if hashtag_list:
                hashtag = '&'.join(hashtag_list)
            else:
                hashtag = ''
            bulk_action.extend([{'update': {'_id': _id}}, {'doc': {'hashtag': hashtag}}])
            if count % 1000 == 0 and count != 0:
                es.bulk(bulk_action, index=index_name,
                        doc_type=flow_text_index_type, timeout=600)
                bulk_action = []
                print count
            count += 1
        except StopIteration:
            break
    if bulk_action:
        print es.bulk(bulk_action, index=index_name,
                      doc_type=flow_text_index_type, timeout=600)
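# Quick self-contained check of the hashtag pattern used in input_hashtag (illustrative
# only, not called by the original code): the pattern requires a delimiter after the tag,
# so a hashtag at the very end of the text with no trailing space/punctuation is skipped.
def _hashtag_demo():
    pattern = re.compile(
        u'#([0-9a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+)[ ," =.。: :、]'
    )
    sample = u'breaking #news today, more on #社会 soon, and #ignored'
    return pattern.findall(sample)  # -> [u'news', u'社会']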
def influence_cal_tw(current_time):
    # if S_TYPE == 'test':
    #     current_time = datetime2ts(S_DATE_FB)
    current_date = ts2datetime(current_time)
    current_time = datetime2ts(current_date)
    flow_text_index_name = twitter_flow_text_index_name_pre + current_date
    count_index_name = twitter_count_index_name_pre + current_date
    tw_bci_index_name = tw_bci_index_name_pre + current_date
    tw_bci_mappings(tw_bci_index_name)
    uid_tid_dict = {}
    bulk_action = []
    # collect the uids (and their tweet ids) seen in today's flow text
    query_body_text = {'query': {'match_all': {}}}
    es_scan_result = scan(es, index=flow_text_index_name, doc_type=twitter_flow_text_index_type,
                          query=query_body_text, size=1000)
    while 1:
        try:
            scan_data = es_scan_result.next()
            item = scan_data['_source']
            uid = item['uid']
            tid = item['tid']
            try:
                uid_tid_dict[uid].append(tid)
            except KeyError:
                uid_tid_dict[uid] = [tid]
        except StopIteration:
            break
    count = 0
    for uid, tid_list in uid_tid_dict.iteritems():
        # activity -- number of active posts for the day
        active_num = influence_active(uid, flow_text_index_name)
        # propagation -- feedback received for the day (seeded with 1 so the log is defined)
        propagate_num_sum = 1
        for tid in tid_list:
            propagate_num = influence_propagate(tid, count_index_name)
            propagate_num_sum += propagate_num
        # coverage -- number of active followers
        cover_num = influence_cover(uid, flow_text_index_name)
        # trust -- 1 if category (group/organisation) is non-empty, otherwise 0
        trust_num = influence_trust(uid)
        influence_mark = (active_num + math.log10(propagate_num_sum) +
                          math.log10(cover_num) + trust_num) * 10
        action = {'index': {'_id': uid}}
        user_items = {}
        user_items['active'] = active_num
        user_items['propagate'] = propagate_num_sum
        user_items['cover'] = cover_num
        user_items['trust'] = trust_num
        user_items['influence'] = influence_mark
        user_items['uid'] = uid
        user_items['timestamp'] = current_time
        bulk_action.extend([action, user_items])
        count += 1
        if count % 1000 == 0:
            es.bulk(bulk_action, index=tw_bci_index_name, doc_type=tw_bci_index_type, timeout=400)
            bulk_action = []
    if bulk_action:
        es.bulk(bulk_action, index=tw_bci_index_name, doc_type=tw_bci_index_type, timeout=400)
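# The score combined inside influence_cal_tw, pulled out as a standalone sketch for
# clarity (illustrative helper, not called by the original code): activity plus
# log-scaled propagation and coverage plus a 0/1 trust flag, scaled by 10.
# propagate_num_sum is seeded at 1 in the caller so its log is defined; cover_num is
# assumed to be positive here.
def influence_score(active_num, propagate_num_sum, cover_num, trust_num):
    return (active_num + math.log10(propagate_num_sum) +
            math.log10(cover_num) + trust_num) * 10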
def Twitter_user_mappings(rows): # db = MySQLdb.connect(host="localhost",user="******",passwd="",db="db_F",charset='utf8') # cursor = db.cursor() # cursor.execute("SELECT * from TIDscouting") # rows = cursor.fetchall() bulk_action = [] count = 0 for row in rows: # print '!!!!',row #print 'create___at....',row[5] action = {'index':{'_id':row[1]}} source_item = {} source_item['uid'] = str(row[1]) source_item['username'] = row[2].decode("utf-8", "replace") source_item['userscreenname'] = row[3].decode("utf-8", "replace") source_item['description'] = row[4].decode("utf-8", "replace") source_item['create_at'] = int(time.mktime(row[5].timetuple())) source_item['url'] = row[6] source_item['profile_image_url'] = row[7] source_item['profile_background_image_url'] = row[8] source_item['location'] = row[9].decode("utf-8", "replace") source_item['timezone'] = row[10] source_item['access_level'] = row[11] source_item['status_count'] = row[12] source_item['followers_count'] = row[13] source_item['friends_count'] = row[14] source_item['favourites_count'] = row[15] source_item['listed_count'] = row[16] source_item['is_protected'] = row[17] source_item['is_geo_enabled'] = row[18] source_item['is_show_all_inline_media'] = row[19] source_item['is_contributors_enable'] = row[20] source_item['is_follow_requestsent'] = row[21] source_item['is_profile_background_tiled'] = row[22] source_item['is_profile_use_background_image'] = row[23] source_item['is_translator'] = row[24] source_item['is_verified'] = row[25] source_item['utcoffset'] = row[26] source_item['lang'] = row[27] source_item['bigger_profile_image_url'] = row[28] source_item['bigger_profile_image_url_https'] = row[29] source_item['mini_profile_image_url'] = row[30] source_item['mini_profile_image_url_https'] = row[31] source_item['original_profile_image_url'] = row[32] source_item['original_profile_image_url_https'] = row[33] source_item['profile_background_image_url_https'] = row[34] source_item['profile_banner_ipad_url'] = row[35] source_item['profile_banner_ipad_retina_url'] = row[36] source_item['profile_banner_mobile_url'] = row[37] source_item['profile_banner_mobile_retina_url'] = row[38] source_item['profile_banner_retina_url'] = row[39] source_item['profile_banner_url'] = row[40] source_item['profile_image_url_https'] = row[41] source_item['update_time'] = int(time.mktime(row[42].timetuple())) source_item['sensitivity'] = row[43] source_item['sensitivity2'] = row[44] bulk_action.extend([action,source_item]) #print 'bulk_action...',bulk_action count += 1 if count % 1000 == 0: es.bulk(bulk_action,index='twitter_user',doc_type='user',timeout=600) bulk_action = [] print count if bulk_action: print '@@',es.bulk(bulk_action,index='twitter_user',doc_type='user',timeout=600)
def save_user_results(bulk_action):
    print es_xnr.bulk(bulk_action, index=all_opinion_corpus_index_name_test,
                      doc_type=all_opinion_corpus_index_type, timeout=600)
    return 'True'
word] = sensitive_words_dict[word] r_cluster.hset('sensitive_' + str(ts), str(uid), json.dumps(sensitive_count_dict)) else: r_cluster.hset('sensitive_' + str(ts), str(uid), json.dumps(sensitive_words_dict)) #identify whether to mapping new es weibo_timestamp = item['timestamp'] #should_index_name_date = ts2datetime(weibo_timestamp) # if should_index_name_date != now_index_name_date: if action != [] and xdata != []: #index_name = index_name_pre + now_index_name_date if bulk_action: es.bulk(bulk_action, index=index_name, doc_type=index_type, timeout=60) bulk_action = [] count = 0 # now_index_name_date = should_index_name_date # index_name = index_name_pre + now_index_name_date #twitter_flow_text_mappings(index_name) # save action, xdata = expand_index_action(item) bulk_action.extend([action, xdata]) count += 1 if count % 1000 == 0 and count != 0: #index_name = index_name_pre + now_index_name_date if bulk_action:
def Facebook_user_mappings(): import format #from format import load_json_text path_file = path + '/' + 'user.json' bulk_action = [] count = 0 #keys_dif = ['uid','update_time',] # 'id' 'updated_time' dict_key_list = ['hometown','location','education','category_list',\ 'favorite_athletes','interested_in','work','parking', 'inspirational_people','languages','cover','favorite_teams'] with open(path_file,'r') as f: for line in f: line = json.loads(line) del line['_id'] keys_list = line.keys() action = {'index':{'_id':line['id']}} source_item = {} source_item['uid'] = line['id'] del line['id'] if 'founded' in keys_list: try: date_time = int(line['founded']) time_string = str(date_time) + '-01-01 00:00:00' timestamp = int(time.mktime(time.strptime(time_string,"%Y-%m-%d %H:%M:%S"))) source_item['founded'] = timestamp except: try: if u'年' in line['founded']: import re time_list = re.split(r'[^\u4e00-\u9fa5]',line['founded']) year = str(time_list[0]) month = str(time_list[1]).zfill(2) day = str(time_list[2]).zfill(2) date = '-'.join([year,month,day]) time_string = date+ ' 00:00:00' timestamp = int(time.mktime(time.strptime(time_string,"%Y-%m-%d %H:%M:%S"))) source_item['founded'] = timestamp else: import re time_list = re.split(r'[^\u4e00-\u9fa5]',line['founded']) month = str(time_list[0]).zfill(2) day = str(time_list[1]).zfill(2) year = str(time_list[2]) date = '-'.join([year,month,day]) time_string = date+ ' 00:00:00' timestamp = int(time.mktime(time.strptime(time_string,"%Y-%m-%d %H:%M:%S"))) source_item['founded'] = timestamp except: source_item['founded'] = 0 del line['founded'] for dict_key in dict_key_list: if dict_key in keys_list: source_item[dict_key] = json.dumps(line[dict_key]) del line[dict_key] if 'updated_time' in keys_list: time_string = line['updated_time'][:10] + ' '+line['updated_time'][11:19] source_item['update_time'] = int(time.mktime(time.strptime(time_string,"%Y-%m-%d %H:%M:%S"))) del line['updated_time'] if 'birthday' in keys_list: birthday_list = line['birthday'].split('/') try: source_item['birthday'] = '-'.join(birthday_list[:2]) except: source_item['birthday'] = ' ' if len(birthday_list) == 3: source_item['birthyear'] = int(birthday_list[2]) else: source_item['birthyear'] = 0 del line['birthday'] keys_list_new = line.keys() for key_item in keys_list_new: source_item[key_item] = line[key_item] bulk_action.extend([action,source_item]) count += 1 if count % 1000 == 0: es.bulk(bulk_action,index='facebook_user',doc_type='user',timeout=600) bulk_action = [] print count if bulk_action: es.bulk(bulk_action,index='facebook_user',doc_type='user',timeout=600)