def save_data2es(data):
    update_uid_list = []
    create_uid_list = []
    try:
        for uid, d in data.items():
            if es.exists(index=tw_portrait_index_name, doc_type=tw_portrait_index_type, id=uid):
                update_uid_list.append(uid)
            else:
                create_uid_list.append(uid)
        # bulk create
        bulk_create_action = []
        if create_uid_list:
            for uid in create_uid_list:
                create_action = {'index': {'_id': uid}}
                bulk_create_action.extend([create_action, data[uid]])
            result = es.bulk(bulk_create_action, index=tw_portrait_index_name, doc_type=tw_portrait_index_type)
            if result['errors']:
                print result
                return False
        # bulk update
        if update_uid_list:
            bulk_update_action = []
            for uid in update_uid_list:
                update_action = {'update': {'_id': uid}}
                bulk_update_action.extend([update_action, {'doc': data[uid]}])
            result = es.bulk(bulk_update_action, index=tw_portrait_index_name, doc_type=tw_portrait_index_type)
            if result['errors']:
                print result
                return False
        return True  # the original fell through and returned None on success
    except Exception, e:
        print e
        return False
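# Usage sketch (illustrative, not part of the pipeline): save_data2es
# takes a {uid: portrait_doc} dict and routes each uid to a bulk create
# or a bulk update. The uids and fields below are made-up examples; the
# es client and tw_portrait_* names are the module-level ones used above.
def demo_save_data2es():
    example_data = {
        '1001': {'uid': '1001', 'nick_name': 'alice', 'influence': 50},
        '1002': {'uid': '1002', 'nick_name': 'bob', 'influence': 30},
    }
    if save_data2es(example_data):
        print 'portrait docs saved'
    else:
        print 'save failed'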
def my_bulk_func(bulk_action, index_name, doc_type):
    # bulk_action: [action, source_item, action, source_item, ...]
    try:
        es.bulk(bulk_action, index=index_name, doc_type=doc_type, timeout=600)
    except Exception, e:
        # On failure, retry in smaller batches; a pair that fails again is dropped.
        # print 'my_bulk_func Exception: ', str(e)
        for i in range(len(bulk_action) / 2):
            temp_bulk_action = bulk_action[2 * i: 2 * i + 2]
            try:
                es.bulk(temp_bulk_action, index=index_name, doc_type=doc_type, timeout=600)
            except:
                pass
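# Usage sketch (illustrative): my_bulk_func expects the flat
# [action, source, action, source, ...] layout noted above. The index
# and type names here are placeholders, not real config values.
def demo_my_bulk_func():
    bulk_action = []
    for uid in ['1001', '1002']:
        bulk_action.extend([{'index': {'_id': uid}}, {'uid': uid}])
    my_bulk_func(bulk_action, 'example_index', 'example_type')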
def test(ft_type):
    print ft_type
    if ft_type == 'facebook':
        index_name_pre = facebook_flow_text_index_name_pre
        index_type = facebook_flow_text_index_type
        user_index_name = facebook_user_index_name
        user_index_type = facebook_user_index_type
    else:
        index_name_pre = twitter_flow_text_index_name_pre
        index_type = twitter_flow_text_index_type
        user_index_name = twitter_user_index_name
        user_index_type = twitter_user_index_type

    # date_list = load_date_list(True)
    date_list = load_date_list()
    DFA = createWordTree()
    query_body = {
        'post_filter': {
            'missing': {
                'field': 'keywords_string'
            }
        },
        'query': {
            'filtered': {
                'filter': {
                    'bool': {
                        'must': [{
                            'range': {
                                'flag_ch': {
                                    'gte': -1
                                }
                            }
                        }]
                    }
                }
            }
        }
    }

    for date in date_list:
        count = 0
        bulk_action = []
        index_name = index_name_pre + date
        try:
            es_scan_results = scan(es, query=query_body, size=1000,
                                   index=index_name, doc_type=index_type)
            while True:
                try:
                    scan_data = es_scan_results.next()
                    item = scan_data['_source']
                    text = item['text_ch']
                    uid = item['uid']
                    if ft_type == 'facebook':
                        _id = item['fid']
                    else:
                        _id = item['tid']

                    ts = datetime2ts(date)
                    # add sentiment field to the post
                    sentiment, keywords_list = triple_classifier(item)
                    # add keywords to the post
                    keywords_dict, keywords_string = get_weibo_keywords(keywords_list)

                    # sensitive_words_dict
                    sensitive_words_dict = searchWord(text.encode('utf-8', 'ignore'), DFA)
                    if sensitive_words_dict:
                        sensitive_words_string_data = '&'.join(sensitive_words_dict.keys())
                        sensitive_words_dict_data = json.dumps(sensitive_words_dict)
                    else:
                        sensitive_words_string_data = ''
                        sensitive_words_dict_data = json.dumps({})

                    # accumulate per-user sensitive-word counts in redis
                    if sensitive_words_dict:
                        sensitive_count_string = r_cluster.hget('sensitive_' + str(ts), str(uid))
                        if sensitive_count_string:  # redis may return empty
                            sensitive_count_dict = json.loads(sensitive_count_string)
                            for word in sensitive_words_dict.keys():
                                if word in sensitive_count_dict:
                                    sensitive_count_dict[word] += sensitive_words_dict[word]
                                else:
                                    sensitive_count_dict[word] = sensitive_words_dict[word]
                            r_cluster.hset('sensitive_' + str(ts), str(uid),
                                           json.dumps(sensitive_count_dict))
                        else:
                            r_cluster.hset('sensitive_' + str(ts), str(uid),
                                           json.dumps(sensitive_words_dict))

                    # sensitive score
                    sensitive_score = 0
                    if sensitive_words_dict:
                        for k, v in sensitive_words_dict.iteritems():
                            tmp_stage = r_sensitive.hget('sensitive_words', k)
                            if tmp_stage:
                                sensitive_score += v * sensitive_score_dict[str(tmp_stage)]

                    # directed_uid
                    directed_uid_data = 0
                    directed_uid, directed_uname = get_root_retweet(text, uid, ft_type)
                    if directed_uid:
                        directed_uid_data = long(directed_uid)

                    # hashtag
                    hashtag = ''
                    RE = re.compile(u'#([0-9a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+)[ ," =.。: :、]')
                    hashtag_list = re.findall(RE, text)
                    if hashtag_list:
                        hashtag = '&'.join(hashtag_list)

                    # action
                    action = {'update': {'_id': _id}}
                    action_data = {
                        'sentiment': str(sentiment),
                        'keywords_dict': json.dumps(keywords_dict),
                        'keywords_string': keywords_string,
                        'sensitive_words_string': sensitive_words_string_data,
                        'sensitive_words_dict': sensitive_words_dict_data,
                        'sensitive': sensitive_score,
                        'directed_uid': directed_uid_data,
                        'directed_uname': directed_uname,
                        'hashtag': hashtag,
                    }
                    bulk_action.extend([action, {'doc': action_data}])
                    count += 1
                    if count % 1000 == 0:
                        if bulk_action:
                            # was hard-coded to facebook_flow_text_index_type,
                            # which broke the twitter branch
                            es.bulk(bulk_action, index=index_name,
                                    doc_type=index_type, timeout=600)
                        bulk_action = []
                        count = 0
                except StopIteration:
                    break
            if bulk_action:
                es.bulk(bulk_action, index=index_name,
                        doc_type=index_type, timeout=600)
        except Exception, e:
            # the ES index/document does not exist
            print e
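# Standalone sketch (illustrative) of the hashtag extraction inside
# test() above: the pattern matches '#' followed by CJK or alphanumeric
# characters plus a trailing delimiter, and hits are joined with '&'
# exactly as the flow-text update expects.
def demo_extract_hashtags(text):
    RE = re.compile(u'#([0-9a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+)[ ," =.。: :、]')
    hashtag_list = re.findall(RE, text)
    if hashtag_list:
        return '&'.join(hashtag_list)
    return ''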
def update_chinese_user(index_name_pre, index_type, user_index_name,
                        user_index_type, date_list):
    # chinese_user == 1 in the user index marks a Chinese-language user
    chinese_user_list = load_chinese_user(user_index_name, user_index_type)
    all_user_list = load_chinese_user(user_index_name, user_index_type, {
        'query': {
            'match_all': {},
        },
        'size': 99999,
    })
    unchinese_user_list = list(set(all_user_list) - set(chinese_user_list))

    # users whose chinese_user flag needs to be updated (to 1 or 0)
    update_list = []

    # flow-text (stream) indices to count posts in
    flow_text_index_list = []
    for date in date_list:
        flow_text_index_list.append(index_name_pre + date)

    # For users whose chinese_user is not 1: when the share of Chinese posts
    # across all their flow_text is high enough and the post count exceeds a
    # threshold, set chinese_user = 1.
    for uid in unchinese_user_list:
        text_num = count_flow_text_num(uid, flow_text_index_list, index_type)
        if text_num >= 20:
            text_num_ch = count_flow_text_num(uid, flow_text_index_list, index_type, True)
            if float(text_num_ch) / text_num > 0.9:
                update_list.append((uid, 1))

    # For users whose chinese_user is 1, selectively downgrade to chinese_user = 0.
    for uid in chinese_user_list:
        text_num = count_flow_text_num(uid, flow_text_index_list, index_type)
        if text_num:
            text_num += 1  # guard against a zero denominator
            text_num_ch = count_flow_text_num(uid, flow_text_index_list, index_type, True)
            if float(text_num_ch) / text_num < 0.9:
                # a user may post little, but if they do post,
                # the Chinese share must stay high
                update_list.append((uid, 0))

    # apply all updates in one pass
    if update_list:
        bulk_update_action = []
        count = 0
        for uid, flag in update_list:
            update_action = {'update': {'_id': uid}}
            bulk_update_action.extend([update_action, {'doc': {'chinese_user': flag}}])
            count += 1
            if count % 1000 == 0:
                es.bulk(bulk_update_action, index=user_index_name, doc_type=user_index_type)
                bulk_update_action = []
        if bulk_update_action:
            es.bulk(bulk_update_action, index=user_index_name, doc_type=user_index_type)
    return True
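# Usage sketch (illustrative): a Twitter-side run of the refresh above,
# reusing the module-level index names and load_date_list() seen in the
# other functions of this repo.
def demo_update_chinese_user_tw():
    date_list = load_date_list()
    update_chinese_user(twitter_flow_text_index_name_pre,
                        twitter_flow_text_index_type,
                        twitter_user_index_name,
                        twitter_user_index_type,
                        date_list)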
# -*- coding: utf-8 -*-
import redis
import sys
sys.path.append('/home/xnr1/xnr_0429/xnr')
from global_utils import es_xnr_2 as es

r_ali = redis.Redis(host='60.205.190.67', port=6379, db=0)
r = r_ali
redis_task_es = 'redis_task_es'

if r_ali.llen(redis_task_es):
    data = r.rpop(redis_task_es)
    if data:
        # tasks are enqueued as stringified dicts, so eval() restores them;
        # this assumes the queue contents are trusted
        data = eval(data)
        bulk_action = data['bulk_action']
        index_name = data['index_name']
        doc_type = data['doc_type']
        try:
            print es.bulk(bulk_action, index=index_name, doc_type=doc_type, timeout=600)
        except Exception, e:
            # On failure, retry in smaller batches; a pair that fails again is dropped.
            print 'my_bulk_func Exception: ', str(e)
            for i in range(len(bulk_action) / 2):
                temp_bulk_action = bulk_action[2 * i: 2 * i + 2]
                try:
                    es.bulk(temp_bulk_action, index=index_name, doc_type=doc_type, timeout=600)
                except:
                    pass
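# Producer-side sketch (illustrative): the worker above rpop()s tasks
# stored as stringified dicts, so a matching producer could enqueue
# work like this. 'example_index' and 'example_type' are placeholders.
def demo_enqueue_bulk_task():
    task = {
        'bulk_action': [{'index': {'_id': '1001'}}, {'uid': '1001'}],
        'index_name': 'example_index',
        'doc_type': 'example_type',
    }
    r_ali.lpush(redis_task_es, str(task))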
def influence_cal_tw(current_time):
    # if S_TYPE == 'test':
    #     current_time = datetime2ts(S_DATE_FB)
    current_date = ts2datetime(current_time)
    current_time = datetime2ts(current_date)
    flow_text_index_name = twitter_flow_text_index_name_pre + current_date
    count_index_name = twitter_count_index_name_pre + current_date
    tw_bci_index_name = tw_bci_index_name_pre + current_date
    tw_bci_mappings(tw_bci_index_name)

    uid_tid_dict = {}
    bulk_action = []

    # collect the uids to score from the day's flow text
    query_body_text = {
        'query': {
            'match_all': {}
        }
    }
    es_scan_result = scan(es, index=flow_text_index_name,
                          doc_type=twitter_flow_text_index_type,
                          query=query_body_text, size=1000)
    while 1:
        try:
            scan_data = es_scan_result.next()
            item = scan_data['_source']
            uid = item['uid']
            tid = item['tid']
            try:
                uid_tid_dict[uid].append(tid)
            except:
                uid_tid_dict[uid] = [tid]
        except StopIteration:
            break

    # (optional second pass over the count index, currently disabled)
    # query_body_count = {'query': {'match_all': {}}}
    # es_scan_result_count = scan(es, index=count_index_name,
    #                             doc_type=twitter_count_index_type,
    #                             size=1000, query=query_body_count)
    # while 1:
    #     try:
    #         scan_data_count = es_scan_result_count.next()
    #         item = scan_data_count['_source']
    #         tid = item['tid']
    #         uid = item['uid']
    #         try:
    #             if tid in uid_tid_dict[uid]:
    #                 continue
    #             else:
    #                 uid_tid_dict[uid].append(tid)
    #         except:
    #             uid_tid_dict[uid] = [tid]
    #     except StopIteration:
    #         break

    count = 0
    for uid, tid_list in uid_tid_dict.iteritems():
        # activeness: number of active posts that day
        active_num = influence_active(uid, flow_text_index_name)
        # active_num = len(tid_list)

        # propagation: feedback received that day
        propagate_num_sum = 1
        for tid in tid_list:
            propagate_num = influence_propagate(tid, count_index_name)
            propagate_num_sum += propagate_num

        # coverage: number of active followers
        cover_num = influence_cover(uid, flow_text_index_name)

        # trust: 1 if the category (organization) field is non-empty, else 0
        trust_num = influence_trust(uid)

        influence_mark = (active_num + math.log10(propagate_num_sum) +
                          math.log10(cover_num) + trust_num) * 10

        action = {'index': {'_id': uid}}
        user_items = {}
        user_items['active'] = active_num
        user_items['propagate'] = propagate_num_sum
        user_items['cover'] = cover_num
        user_items['trust'] = trust_num
        user_items['influence'] = influence_mark
        user_items['uid'] = uid
        user_items['timestamp'] = current_time
        bulk_action.extend([action, user_items])
        count += 1
        # was `if count % 1000:`, which flushed on almost every item
        if count % 1000 == 0:
            es.bulk(bulk_action, index=tw_bci_index_name, doc_type=tw_bci_index_type, timeout=400)
            bulk_action = []
    if bulk_action:
        es.bulk(bulk_action, index=tw_bci_index_name, doc_type=tw_bci_index_type, timeout=400)
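# Scoring sketch (illustrative): the BCI mark from influence_cal_tw()
# isolated for clarity. propagate_num_sum is seeded with 1 in the loop
# above so its log term is defined; the max() guard on cover_num is an
# assumption added here, since math.log10(0) would raise.
def demo_influence_mark(active_num, propagate_num_sum, cover_num, trust_num):
    return (active_num +
            math.log10(max(propagate_num_sum, 1)) +
            math.log10(max(cover_num, 1)) +
            trust_num) * 10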
def scan_retweet(ft_type):
    count = 0
    scan_cursor = 0
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    # get redis db number
    db_number = get_db_num(now_date_ts)
    if ft_type == 'fb':
        retweet_redis_dict = fb_retweet_dict
        index_name = 'fb_be_retweet_' + str(db_number)
    else:
        retweet_redis_dict = tw_retweet_dict
        index_name = 'tw_be_retweet_' + str(db_number)
    # get redis db
    retweet_redis = retweet_redis_dict[str(db_number)]

    # # 1. check whether the db we are about to switch to already has data
    # while 1:
    #     redis_host_list.pop(db_number)
    #     other_db_number = retweet_redis_dict[redis_host_list[0]]  # the matching redis
    #     current_dbsize = other_db_number.dbsize()
    #     if current_dbsize:
    #         break  # writes have moved to the new db, so the previous day's data is complete
    #     else:
    #         time.sleep(60)

    # 2. recreate the es mappings
    be_retweet_es_mappings(str(db_number), ft_type)

    # 3. scan
    be_retweet_bulk_action = []
    start_ts = time.time()
    # be_retweet count
    be_retweet_count = 0
    while True:
        re_scan = retweet_redis.scan(scan_cursor, count=100)
        re_scan_cursor = re_scan[0]
        for item in re_scan[1]:
            count += 1
            item_list = item.split('_')
            save_dict = {}
            if len(item_list) == 3:
                be_retweet_count += 1
                uid = item_list[2]
                item_result = retweet_redis.hgetall(item)
                save_dict['uid'] = uid
                save_dict['uid_be_retweet'] = json.dumps(item_result)
                be_retweet_bulk_action.extend([{'index': {'_id': uid}}, save_dict])
        if be_retweet_bulk_action:
            es.bulk(be_retweet_bulk_action, index=index_name, doc_type='user')
            be_retweet_bulk_action = []
        if re_scan_cursor == 0:  # a cursor of 0 means the SCAN iteration is complete
            print 'scan finish'
            break
        scan_cursor = re_scan_cursor  # advance the cursor; the original never updated it
    end_ts = time.time()
    print '%s sec scan %s count user' % (end_ts - start_ts, count)
    # flushdb
    retweet_redis.flushdb()
    print 'end'
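# Usage sketch (illustrative): scan_retweet() drains a full day's redis
# db into ES and then flushdb()s it, so it should be run once per
# platform after the daily db rotation.
def demo_scan_retweet_daily():
    for ft_type in ['fb', 'tw']:
        scan_retweet(ft_type)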