def co_search(es, user_list, bulk_action, count_n, tb):
    search_list = []
    for item in user_list:
        uid = item.get('uid', '0') # obtain uid, notice "uid" or "user"
        search_list.append(uid)
    search_result = es.mget(index=index_destination, doc_type=index_destination_doctype, body={"ids": search_list}, _source=False)["docs"]
    search_list = []
    for item in search_result:
        if not item['found']:
            user_info = {}
            user_info['uid'] = item['_id']
            user_info['low_number'] = 0
            xdata = expand_index_action(user_info)
            bulk_action.extend([xdata[0], xdata[1]])
            count_n += 1
            if count_n % 1000 == 0:
                es.bulk(bulk_action, index=index_destination, doc_type=index_destination_doctype, timeout=30)
                bulk_action = []
                print count_n
            if count_n % 10000 == 0:
                ts = time.time()
                print "count_n %s per %s second" % (count_n, ts - tb)
                print "count %s " % count
                tb = ts
    return bulk_action, count_n, tb
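co_search fills in missing portrait entries via an expand_index_action helper that none of these snippets define; a minimal sketch consistent with how its two return values feed es.bulk (an action line followed by a document line) might look as follows — the body is an assumption, not the source's implementation.

# Hypothetical helper (assumption): build the (action, document) pair that
# co_search appends to bulk_action for each uid missing from the index.
def expand_index_action(user_info):
    action = {'index': {'_id': user_info['uid']}}
    return (action, user_info)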
def save_dg_pr_results(sorted_uids, es_num, flag):
    index_name = "user_portrait_network"
    index_type = "network"
    bulk_action = []
    for uid, rank in sorted_uids:
        if uid == 'global':
            continue
        user_results = {}
        user_results['uid'] = uid
        user_results[flag + '_' + str(es_num)] = rank
        if es_num == 0:
            action = {'index': {'_id': uid}}
            bulk_action.extend([action, user_results])
        else:
            try:
                item_exist = es_user_portrait.get(index=index_name, doc_type=index_type, id=uid)['_source']
                action = {'update': {'_id': uid}}
                try:
                    pr_last = item_exist[flag + '_' + str(es_num - 1)]
                except:
                    pr_last = 0
                user_results[flag + '_diff_' + str(es_num)] = rank - pr_last
                bulk_action.extend([action, {'doc': user_results}])
            except:
                action = {'index': {'_id': uid}}
                pr_last = 0
                user_results[flag + '_diff_' + str(es_num)] = rank - pr_last
                bulk_action.extend([action, user_results])
    #print bulk_action
    es_user_portrait.bulk(bulk_action, index=index_name, doc_type=index_type)
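A hedged usage sketch for save_dg_pr_results: sorted_uids is evidently a list of (uid, score) pairs in descending score order, with a 'global' entry to skip; the sample scores and the 'pr' flag below are invented for illustration.

# Illustration only: feeding ranked PageRank-style scores into save_dg_pr_results.
pagerank_scores = {'2093492921': 0.0312, '1098012169': 0.0177, 'global': 1.0}
sorted_uids = sorted(pagerank_scores.items(), key=lambda x: x[1], reverse=True)
save_dg_pr_results(sorted_uids, 0, 'pr')  # es_num == 0: plain index actions
save_dg_pr_results(sorted_uids, 1, 'pr')  # later runs also store pr_diff_1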
def save_user_results(bulk_action):
    print 'save utils bulk action len:', len(bulk_action)
    #print 'bulk action:', bulk_action
    es.bulk(bulk_action, index='sensitive_user_portrait', doc_type=index_type, timeout=60)
    return True
def save_user_results(bulk_action):
    #print 'save utils bulk action len:', len(bulk_action) #test
    #print 'bulk_action:', bulk_action
    #portrait_index_name = 'user_portrait_0303'
    #portrait_index_type = 'user'
    es.bulk(bulk_action, index=portrait_index_name, doc_type=portrait_index_type, timeout=600)
    return True
def deal_bulk_action(user_info_list, fansnum_max):
    start_ts = time.time()
    uid_list = user_info_list.keys()
    #acquire bulk user weibo data
    if WEIBO_API_INPUT_TYPE == 0:
        user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts = read_flow_text_sentiment(uid_list)
    else:
        user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts = read_flow_text(uid_list)
    #compute attribute--keywords, topic, online_pattern
    #get user topic results by bulk action
    topic_results_dict, topic_results_label = topic_classfiy(uid_list, user_keywords_dict)
    #get bulk action
    bulk_action = []
    for uid in uid_list:
        results = {}
        results['uid'] = uid
        #add user topic attribute
        user_topic_dict = topic_results_dict[uid]
        user_label_dict = topic_results_label[uid]
        results['topic'] = json.dumps(user_topic_dict)
        results['topic_string'] = topic_en2ch(user_label_dict)
        #add user keywords attribute
        keywords_dict = user_keywords_dict[uid]
        keywords_top50 = sorted(keywords_dict.items(), key=lambda x: x[1], reverse=True)[:50]
        keywords_top50_string = '&'.join([keyword_item[0] for keyword_item in keywords_top50])
        results['keywords'] = json.dumps(keywords_top50)
        results['keywords_string'] = keywords_top50_string
        #add online_pattern
        user_online_pattern = online_pattern_dict[uid]
        results['online_pattern'] = json.dumps(user_online_pattern)
        try:
            results['online_pattern_aggs'] = '&'.join(user_online_pattern.keys())
        except:
            results['online_pattern_aggs'] = ''
        #add user importance
        user_domain = user_info_list[uid]['domain'].encode('utf-8')
        user_fansnum = user_info_list[uid]['fansnum']
        results['importance'] = get_importance(user_domain, results['topic_string'], user_fansnum, fansnum_max)
        #bulk action
        action = {'update': {'_id': uid}}
        bulk_action.extend([action, {'doc': results}])
    es_user_portrait.bulk(bulk_action, index=portrait_index_name, doc_type=portrait_index_type)
    end_ts = time.time()
    #log_should_delete
    #print '%s sec count %s' % (end_ts - start_ts, len(uid_list))
    #log_should_delete
    start_ts = end_ts
def split_bulk_action(bulk_action, index_name):
    new_bulk_action = []
    for i in range(0, len(bulk_action)):
        if i % 2 == 0:
            new_bulk_action = [bulk_action[i], bulk_action[i + 1]]
            try:
                es.bulk(new_bulk_action, index=index_name, doc_type='user')
            except:
                print 'cron/flow3/scan_redis2es_comment.py&error-1&'
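split_bulk_action is a fallback: when a whole batch fails, it replays the batch one action/document pair at a time, so a single bad document does not sink the rest. A hedged usage sketch with an invented two-pair batch:

# Illustration only: retry a failed batch pair by pair.
bulk_action = [
    {'index': {'_id': '123'}}, {'uid': '123', 'uid_comment': '{}'},
    {'index': {'_id': '456'}}, {'uid': '456', 'uid_comment': '{}'},
]
try:
    es.bulk(bulk_action, index='1225_comment_0', doc_type='user')
except:
    split_bulk_action(bulk_action, '1225_comment_0')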
def split_bulk_action(bulk_action, index_name):
    new_bulk_action = []
    for i in range(0, len(bulk_action)):
        if i % 2 == 0:
            new_bulk_action = [bulk_action[i], bulk_action[i + 1]]
            #print 'new_bulk_action:', new_bulk_action
            try:
                es.bulk(new_bulk_action, index=index_name, doc_type='user')
            except:
                error_f.writelines([new_bulk_action[0]['index']['_id'], '\n'])
def deal_bulk_action(user_info_list, fansnum_max):
    start_ts = time.time()
    uid_list = user_info_list.keys()
    #acquire bulk user weibo data
    if WEIBO_API_INPUT_TYPE == 0:
        user_keywords_dict, user_weibo_dict, character_start_ts = read_flow_text_sentiment(uid_list)
    else:
        user_keywords_dict, user_weibo_dict, character_start_ts = read_flow_text(uid_list)
    #compute attribute--domain, character, importance
    #get user domain
    domain_results = domain_classfiy(uid_list, user_keywords_dict)
    domain_results_dict = domain_results[0]
    domain_results_label = domain_results[1]
    #get user character
    character_start_time = ts2datetime(character_start_ts)
    character_end_time = ts2datetime(character_start_ts + DAY * CHARACTER_TIME_GAP - DAY)
    character_sentiment_result_dict = classify_sentiment(uid_list, user_weibo_dict, character_start_time, character_end_time, WEIBO_API_INPUT_TYPE)
    character_text_result_dict = classify_topic(uid_list, user_keywords_dict)
    bulk_action = []
    for uid in uid_list:
        results = {}
        results['uid'] = uid
        #add user domain attribute
        user_domain_dict = domain_results_dict[uid]
        user_label_dict = domain_results_label[uid]
        results['domain_v3'] = json.dumps(user_domain_dict)
        results['domain'] = domain_en2ch(user_label_dict)
        #add user character_sentiment attribute
        character_sentiment = character_sentiment_result_dict[uid]
        results['character_sentiment'] = character_sentiment
        #add user character_text attribute
        character_text = character_text_result_dict[uid]
        results['character_text'] = character_text
        #get user importance
        user_topic_string = user_info_list[uid]['topic_string'].encode('utf-8')
        user_fansnum = user_info_list[uid]['fansnum']
        results['importnace'] = get_importance(results['domain'], user_topic_string, user_fansnum, fansnum_max) # 'importnace' (sic): misspelled field name kept as stored in the index
        #bulk action
        action = {'update': {'_id': uid}}
        bulk_action.extend([action, {'doc': results}])
    es_user_portrait.bulk(bulk_action, index=portrait_index_name, doc_type=portrait_index_type)
    end_ts = time.time()
    #log_should_delete
    print '%s sec count %s' % (end_ts - start_ts, len(uid_list))
def save_bulk_action(uid_list, hashtag_results, geo_results, activeness_results, influence_results):
    bulk_action = []
    for uid in uid_list:
        user_results = {}
        user_results = dict(user_results, **hashtag_results[uid])
        user_results = dict(user_results, **geo_results[uid])
        user_results = dict(user_results, **activeness_results[uid])
        user_results = dict(user_results, **influence_results[uid])
        action = {'update': {'_id': uid}}
        bulk_action.extend([action, {'doc': user_results}])
    es_user_portrait.bulk(bulk_action, index=portrait_index_name, doc_type=portrait_index_type)
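The repeated dict(user_results, **part) calls are a Python 2 idiom for merging dicts without mutating either input; a tiny illustration with invented attribute slices:

# Illustration only: each computation contributes one slice of the user document.
hashtag_part = {'hashtag': json.dumps({'spring': 3})}
geo_part = {'activity_geo': 'beijing'}
merged = dict(hashtag_part, **geo_part)  # same as copy() + update(); later keys win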
def main():
    ts = time.time()
    bulk_action = []
    copy_bulk_action = []
    count = 0
    delete_key = ts2datetime(ts - 7*86400) # each day, delete the data from 7 days ago
    temp_list = recommend_redis.hget("decide_delete_list", delete_key) # fixed: the source passed an undefined `item` here
    if temp_list:
        delete_list = json.loads(temp_list) # list of users awaiting deletion
        recommend_redis.hdel("decide_delete_list", delete_key) # hdel is presumably meant to be called on recommend_redis
        for uid in delete_list:
            del_data = expand_delete_action(uid, portrait_index_name, portrait_index_type)
            copy_del_data = expand_delete_action(uid, copy_portrait_index_name, copy_portrait_index_type)
            bulk_action.append(del_data)
            copy_bulk_action.append(copy_del_data)
            count += 1
            if count % 100 == 0:
                es.bulk(bulk_action, index=portrait_index_name, doc_type=portrait_index_type, timeout=30)
                es.bulk(copy_bulk_action, index=copy_portrait_index_name, doc_type=copy_portrait_index_type, timeout=30)
                bulk_action = []
                copy_bulk_action = []
        if bulk_action:
            es.bulk(bulk_action, index=portrait_index_name, doc_type=portrait_index_type, timeout=30)
            es.bulk(copy_bulk_action, index=copy_portrait_index_name, doc_type=copy_portrait_index_type, timeout=30)
def save_bulk_action(uid_list, hashtag_results, geo_results, activeness_results, influence_results, sensitive_results, profile_results):
    bulk_action = []
    for uid in uid_list:
        user_results = {}
        user_results = dict(user_results, **hashtag_results[uid])
        user_results = dict(user_results, **geo_results[uid])
        user_results = dict(user_results, **activeness_results[uid])
        user_results = dict(user_results, **influence_results[uid])
        user_results = dict(user_results, **sensitive_results[uid])
        user_results = dict(user_results, **profile_results[uid])
        #print 'user_results_sensitive:', user_results['sensitive']
        action = {'update': {'_id': uid}}
        bulk_action.extend([action, {'doc': user_results}])
    #print 'bulk_action:', bulk_action
    es_user_portrait.bulk(bulk_action, index=portrait_index_name, doc_type=portrait_index_type)
def deal_bulk_action(user_info_list, fansnum_max):
    start_ts = time.time()
    uid_list = user_info_list.keys()
    #acquire bulk user weibo data
    if WEIBO_API_INPUT_TYPE == 0:
        user_keywords_dict, user_weibo_dict, character_start_ts = read_flow_text_sentiment(uid_list)
    else:
        user_keywords_dict, user_weibo_dict, character_start_ts = read_flow_text(uid_list)
    #compute attribute--domain, character, importance
    #get user domain
    domain_results = domain_classfiy(uid_list, user_keywords_dict)
    domain_results_dict = domain_results[0]
    domain_results_label = domain_results[1]
    #get user character
    character_end_time = ts2datetime(character_start_ts)
    character_start_time = ts2datetime(character_start_ts - DAY * CHARACTER_TIME_GAP)
    character_sentiment_result_dict = classify_sentiment(uid_list, user_weibo_dict, character_start_time, character_end_time, WEIBO_API_INPUT_TYPE)
    character_text_result_dict = classify_topic(uid_list, user_keywords_dict)
    bulk_action = []
    for uid in uid_list:
        results = {}
        results['uid'] = uid
        #add user domain attribute
        user_domain_dict = domain_results_dict[uid]
        user_label_dict = domain_results_label[uid]
        results['domain_v3'] = json.dumps(user_domain_dict)
        results['domain'] = domain_en2ch(user_label_dict)
        #add user character_sentiment attribute
        character_sentiment = character_sentiment_result_dict[uid]
        results['character_sentiment'] = character_sentiment
        #add user character_text attribute
        character_text = character_text_result_dict[uid]
        results['character_text'] = character_text
        #get user importance
        user_topic_string = user_info_list[uid]['topic_string'].encode('utf-8')
        user_fansnum = user_info_list[uid]['fansnum']
        results['importnace'] = get_importance(results['domain'], user_topic_string, user_fansnum, fansnum_max) # 'importnace' (sic): misspelled field name kept as stored in the index
        #bulk action
        action = {'update': {'_id': uid}}
        bulk_action.extend([action, {'doc': results}])
    es_user_portrait.bulk(bulk_action, index=portrait_index_name, doc_type=portrait_index_type)
    end_ts = time.time()
    #log_should_delete
    print '%s sec count %s' % (end_ts - start_ts, len(uid_list))
def save_dg_pr_results(sorted_uids, es_num, flag):
    index_name = "user_portrait_network"
    index_type = "network"
    bulk_action = []
    count = 0
    for uid, rank in sorted_uids:
        if uid == 'global':
            continue
        count += 1
        user_results = {}
        user_results['uid'] = uid
        user_results[flag + '_' + str(es_num)] = rank
        user_results['rank_' + flag + '_' + str(es_num)] = count #rank
        if es_num == 0:
            action = {'index': {'_id': uid}}
            bulk_action.extend([action, user_results])
        else:
            try:
                item_exist = es_user_portrait.get(index=index_name, doc_type=index_type, id=uid)['_source']
                action = {'update': {'_id': uid}}
                try:
                    pr_last = item_exist[flag + '_' + str(es_num - 1)]
                    rank_last = item_exist['rank_' + flag + '_' + str(es_num - 1)]
                except:
                    pr_last = 0
                    rank_last = 101
                user_results[flag + '_diff_' + str(es_num)] = rank - pr_last
                user_results['rank_' + flag + '_diff_' + str(es_num)] = abs(count - rank_last)
                bulk_action.extend([action, {'doc': user_results}])
            except:
                action = {'index': {'_id': uid}}
                pr_last = 0
                rank_last = 101
                user_results[flag + '_diff_' + str(es_num)] = rank - pr_last
                user_results['rank_' + flag + '_diff_' + str(es_num)] = abs(count - rank_last)
                bulk_action.extend([action, user_results])
    #print bulk_action
    es_user_portrait.bulk(bulk_action, index=index_name, doc_type=index_type)
def main():
    ts = time.time()
    bulk_action = []
    copy_bulk_action = []
    count = 0
    delete_key = ts2datetime(ts - 7 * 86400) # each day, delete the data from 7 days ago
    temp_list = recommend_redis.hget("decide_delete_list", delete_key)
    if temp_list:
        delete_list = json.loads(temp_list) # list of users awaiting deletion
        recommend_redis.hdel("decide_delete_list", delete_key) # hdel is presumably meant to be called on recommend_redis
        for uid in delete_list:
            del_data = expand_delete_action(uid, portrait_index_name, portrait_index_type)
            copy_del_data = expand_delete_action(uid, copy_portrait_index_name, copy_portrait_index_type)
            bulk_action.append(del_data)
            copy_bulk_action.append(copy_del_data)
            count += 1
            if count % 100 == 0:
                es.bulk(bulk_action, index=portrait_index_name, doc_type=portrait_index_type, timeout=30)
                es.bulk(copy_bulk_action, index=copy_portrait_index_name, doc_type=copy_portrait_index_type, timeout=30)
                bulk_action = []
                copy_bulk_action = []
        if bulk_action:
            es.bulk(bulk_action, index=portrait_index_name, doc_type=portrait_index_type, timeout=30)
            es.bulk(copy_bulk_action, index=copy_portrait_index_name, doc_type=copy_portrait_index_type, timeout=30)
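Neither main() variant defines expand_delete_action; since its result is appended as a single bulk line, it presumably builds a bare delete action (deletes carry no document body in the bulk format). A sketch under that assumption:

# Hypothetical helper (assumption): one bulk 'delete' action line per uid.
def expand_delete_action(uid, index_name, index_type):
    return {'delete': {'_index': index_name, '_type': index_type, '_id': uid}}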
def scan_comment():
    count = 0
    scan_cursor = 0
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    #get redis db number
    db_number = get_db_num(now_date_ts)
    #comment/be_comment es mappings
    '''
    comment_es_mappings(str(db_number))
    be_comment_es_mappings(str(db_number))
    '''
    #get redis db
    comment_redis = comment_redis_dict[str(db_number)]
    comment_bulk_action = []
    be_comment_bulk_action = []
    start_ts = time.time()
    #comment count/be_comment count
    comment_count = 0
    be_comment_count = 0
    while True:
        re_scan = comment_redis.scan(scan_cursor, count=100)
        re_scan_cursor = re_scan[0]
        for item in re_scan[1]:
            count += 1
            item_list = item.split('_')
            save_dict = {}
            if len(item_list) == 2:
                comment_count += 1
                uid = item_list[1]
                item_result = comment_redis.hgetall(item)
                save_dict['uid'] = uid
                save_dict['uid_comment'] = json.dumps(item_result)
                comment_bulk_action.extend([{'index': {'_id': uid}}, save_dict])
            '''
            elif len(item_list)==3:
                be_comment_count += 1
                uid = item_list[2]
                item_result = comment_redis.hgetall(item)
                save_dict['uid'] = uid
                save_dict['uid_be_comment'] = json.dumps(item_result)
                be_comment_bulk_action.extend([{'index':{'_id': uid}}, save_dict])
            '''
        try:
            es.bulk(comment_bulk_action, index='1225_comment_' + str(db_number), doc_type='user')
        except:
            index_name = '1225_comment_' + str(db_number)
            split_bulk_action(comment_bulk_action, index_name)
        '''
        try:
            es.bulk(be_comment_bulk_action, index='1225_be_comment_'+str(db_number), doc_type='user')
        except:
            index_name = '1225_be_comment_'+str(db_number)
            split_bulk_action(be_comment_bulk_action, index_name)
        '''
        comment_bulk_action = []
        #be_comment_bulk_action = []
        end_ts = time.time()
        print '%s sec scan %s count user' % (end_ts - start_ts, count)
        start_ts = end_ts
        scan_cursor = re_scan[0]
        if scan_cursor == 0:
            break
    print 'count:', count
    print 'end'
def scan_comment():
    count = 0
    scan_cursor = 0
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    #get redis db number
    db_number = get_db_num(now_date_ts)
    #comment/be_comment es mappings
    #get redis db
    comment_redis = sensitive_comment_redis_dict[str(db_number)]
    """
    # 1. check whether the db about to be written to already has data
    sensitive_redis_host_list.remove(str(db_number))
    while 1:
        other_db_number = comment_redis_dict[redis_host_list[0]]
        current_dbsize = other_db_number.dbsize()
        if current_dbsize:
            break # writing to the new db has begun, so the previous day's data is complete
        else:
            time.sleep(60)
    """
    # 2. drop the previous es indices
    comment_es_mappings(str(db_number))
    be_comment_es_mappings(str(db_number))
    # 3. scan
    comment_bulk_action = []
    be_comment_bulk_action = []
    start_ts = time.time()
    #comment count/be_comment count
    comment_count = 0
    be_comment_count = 0
    while True:
        re_scan = comment_redis.scan(scan_cursor, count=100)
        re_scan_cursor = re_scan[0]
        for item in re_scan[1]:
            count += 1
            item_list = item.split('_')
            save_dict = {}
            if len(item_list) == 2:
                comment_count += 1
                uid = item_list[1]
                item_result = comment_redis.hgetall(item)
                save_dict['uid'] = uid
                save_dict['uid_comment'] = json.dumps(item_result)
                comment_bulk_action.extend([{'index': {'_id': uid}}, save_dict])
            elif len(item_list) == 3:
                be_comment_count += 1
                uid = item_list[2]
                item_result = comment_redis.hgetall(item)
                save_dict['uid'] = uid
                save_dict['uid_be_comment'] = json.dumps(item_result)
                be_comment_bulk_action.extend([{'index': {'_id': uid}}, save_dict])
        #try:
        if comment_bulk_action:
            es.bulk(comment_bulk_action, index=sensitive_comment_index_name_pre + str(db_number), doc_type='user')
        #except:
        #    index_name = '1225_comment_'+str(db_number)
        #    split_bulk_action(comment_bulk_action, index_name)
        #try:
        if be_comment_bulk_action:
            es.bulk(be_comment_bulk_action, index=sensitive_be_comment_index_name_pre + str(db_number), doc_type='user')
        #except:
        #    index_name = '1225_be_comment_'+str(db_number)
        #    split_bulk_action(be_comment_bulk_action, index_name)
        comment_bulk_action = []
        be_comment_bulk_action = []
        end_ts = time.time()
        #run_type
        #if RUN_TYPE == 1:
        print '%s sec scan %s count user' % (end_ts - start_ts, count)
        start_ts = end_ts
        scan_cursor = re_scan[0]
        if scan_cursor == 0:
            break
def scan_retweet():
    count = 0
    scan_cursor = 0
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    #get redis db number
    db_number = get_db_num(now_date_ts)
    #get redis db
    retweet_redis = retweet_redis_dict[str(db_number)]
    """
    # 1. check whether the db about to be written to already has data
    while 1:
        redis_host_list.pop(str(db_number))
        other_db_number = retweet_redis_dict[redis_host_list[0]] # the corresponding redis
        current_dbsize = other_db_number.dbsize()
        if current_dbsize:
            break # writing to the new db has begun, so the previous day's data is complete
        else:
            time.sleep(60)
    """
    # 2. drop the previous es indices
    retweet_es_mappings(str(db_number))
    be_retweet_es_mappings(str(db_number))
    # 3. scan
    retweet_bulk_action = []
    be_retweet_bulk_action = []
    start_ts = time.time()
    #retweet count/be_retweet count
    retweet_count = 0
    be_retweet_count = 0
    while True:
        re_scan = retweet_redis.scan(scan_cursor, count=100)
        re_scan_cursor = re_scan[0]
        for item in re_scan[1]:
            count += 1
            item_list = item.split('_')
            save_dict = {}
            if len(item_list) == 2:
                retweet_count += 1
                uid = item_list[1]
                item_result = retweet_redis.hgetall(item)
                save_dict['uid'] = uid
                save_dict['uid_retweet'] = json.dumps(item_result)
                retweet_bulk_action.extend([{'index': {'_id': uid}}, save_dict])
            elif len(item_list) == 3:
                be_retweet_count += 1
                uid = item_list[2]
                item_result = retweet_redis.hgetall(item)
                save_dict['uid'] = uid
                save_dict['uid_be_retweet'] = json.dumps(item_result)
                be_retweet_bulk_action.extend([{'index': {'_id': uid}}, save_dict])
        if retweet_bulk_action:
            es.bulk(retweet_bulk_action, index='1225_retweet_' + str(db_number), doc_type='user')
        if be_retweet_bulk_action:
            es.bulk(be_retweet_bulk_action, index='1225_be_retweet_' + str(db_number), doc_type='user')
        retweet_bulk_action = []
        be_retweet_bulk_action = []
        end_ts = time.time()
        #run_type
        if RUN_TYPE == 0:
            print '%s sec scan %s count user:' % (end_ts - start_ts, count)
        start_ts = end_ts
        scan_cursor = re_scan[0]
        if scan_cursor == 0:
            break
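The scan functions pick a redis db with get_db_num, which is not shown; the surrounding comments suggest a daily rotation so yesterday's db can be drained into ES while today's is written. A plausible sketch, purely an assumption about the rotation scheme:

# Hypothetical sketch (assumption): map the current date to one of two
# alternating redis dbs; the real scheme may rotate over more hosts.
def get_db_num(now_date_ts):
    day_index = int(now_date_ts) / 86400  # Python 2 integer division
    return day_index % 2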
def deal_bulk_action(user_info_list, fansnum_max):
    start_ts = time.time()
    uid_list = user_info_list.keys()
    #acquire bulk user weibo data
    if WEIBO_API_INPUT_TYPE == 0:
        user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts = read_flow_text_sentiment(uid_list)
    else:
        user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts = read_flow_text(uid_list)
    #compute attribute--keywords, topic, online_pattern
    #get user topic results by bulk action
    topic_results_dict, topic_results_label = topic_classfiy(uid_list, user_keywords_dict)
    #update school attribute---is_school/school_string/school_dict
    school_results_dict = get_school(uid_list)
    #get bulk action
    bulk_action = []
    for uid in uid_list:
        results = {}
        results['uid'] = uid
        results['is_school'] = school_results_dict[uid]['is_school']
        results['school_string'] = school_results_dict[uid]['school_string']
        results['school_dict'] = school_results_dict[uid]['school_dict']
        #print 'is_school, school_string, school_dict:', results['is_school'], results['school_string'], results['school_dict']
        #add user topic attribute
        user_topic_dict = topic_results_dict[uid]
        user_label_dict = topic_results_label[uid]
        results['topic'] = json.dumps(user_topic_dict)
        results['topic_string'] = topic_en2ch(user_label_dict)
        #add user keywords attribute
        try:
            keywords_dict = user_keywords_dict[uid]
        except:
            keywords_dict = {}
        keywords_top50 = sorted(keywords_dict.items(), key=lambda x: x[1], reverse=True)[:50]
        keywords_top50_string = '&'.join([keyword_item[0] for keyword_item in keywords_top50])
        results['keywords'] = json.dumps(keywords_top50)
        results['keywords_string'] = keywords_top50_string
        #add online_pattern
        try:
            user_online_pattern = json.dumps(online_pattern_dict[uid])
        except:
            user_online_pattern = json.dumps({})
        results['online_pattern'] = user_online_pattern
        try:
            # note: user_online_pattern is already a JSON string here, so .keys()
            # raises and this always falls back to '' -- kept as in the source
            results['online_pattern_aggs'] = '&'.join(user_online_pattern.keys())
        except:
            results['online_pattern_aggs'] = ''
        #add user importance
        user_domain = user_info_list[uid]['domain'].encode('utf-8')
        user_fansnum = user_info_list[uid]['fansnum']
        results['importance'] = get_importance(user_domain, results['topic_string'], user_fansnum, fansnum_max)
        #bulk action
        action = {'update': {'_id': uid}}
        bulk_action.extend([action, {'doc': results}])
    #print 'bulk_action:', bulk_action
    es_user_portrait.bulk(bulk_action, index=portrait_index_name, doc_type=portrait_index_type)
    end_ts = time.time()
    #log_should_delete
    #print '%s sec count %s' % (end_ts - start_ts, len(uid_list))
    #log_should_delete
    start_ts = end_ts
# (fragment: tail of scan_index_history, continuing inside the per-uid loop)
        try:
            user_history_item.pop(del_activeness_key)
            user_history_item.pop(del_influence_key)
            user_history_item.pop(del_importance_key)
        except:
            pass
        new_user_item = dict(user_history_item, **add_info[uid])
        if add_info[uid][influence_key] < LOW_INFLUENCE_THRESHOULD:
            try:
                new_user_item["low_number"] += 1
            except:
                new_user_item["low_number"] = 1
        else:
            new_user_item["low_number"] = 0
        aver_activeness, aver_influence, aver_importance = average_value(new_user_item)
        new_user_item['aver_activeness'] = aver_activeness
        new_user_item['aver_influence'] = aver_influence
        new_user_item['aver_importance'] = aver_importance
        action = {'index': {'_id': uid}}
        bulk_action.extend([action, new_user_item])
        iter_count += 1
    es_user_portrait.bulk(bulk_action, index=copy_portrait_index_name, doc_type=copy_portrait_index_type)
    bulk_action = []
    add_info = {}
    print 'count:', count

if __name__ == '__main__':
    scan_index_history()
def scan_index_history():
    s_re = scan(es_user_portrait, query={'query': {'match_all': {}}, 'size': 1000}, index=portrait_index_name, doc_type=portrait_index_type)
    bulk_action = []
    add_info = {}
    count = 0
    start_ts = time.time()
    now_date = ts2datetime(start_ts - DAY)
    now_date = '2013-09-06'
    #now_date_string = ''.join(now_date.split('-'))
    now_date_string = now_date
    activeness_key = 'activeness_' + now_date_string
    #influence_key = now_date_string
    influence_key = now_date_string
    importance_key = "importance_" + now_date_string
    del_date = ts2datetime(time.time() - DAY * 31)
    #del_date_string = ''.join(del_date.split('-'))
    del_date_string = del_date
    del_activeness_key = 'activeness_' + del_date_string
    #del_influence_key = del_date_string
    del_influence_key = del_date_string
    del_importance_key = "importance_" + del_date_string
    #get max value for importance and activeness
    max_activeness = get_max_index('activeness')
    max_influence = get_max_index('influence')
    max_importance = get_max_index('importance')
    while True:
        try:
            scan_re = s_re.next()['_source']
            count += 1
            uid = scan_re['uid']
            activeness_key = 'activeness_' + now_date_string
            influence_key = now_date_string
            importance_key = "importance_" + now_date_string
            #save to normal activeness and normal influence
            activeness_value = scan_re['activeness']
            influence_value = scan_re['influence']
            importance_value = scan_re['importance']
            normal_activeness = normal_index(activeness_value, max_activeness)
            normal_influence = normal_index(influence_value, max_influence)
            normal_importance = normal_index(importance_value, max_importance)
            add_info[uid] = {activeness_key: normal_activeness, influence_key: normal_influence, importance_key: normal_importance}
            if count % 1000 == 0:
                uid_list = add_info.keys()
                evaluate_history_results = es_user_portrait.mget(index=copy_portrait_index_name, doc_type=copy_portrait_index_type, body={'ids': uid_list})['docs']
                '''
                del_date = ts2datetime(time.time() - DAY*31)
                del_date_string = ''.join(s)
                del_activeness_key = 'activeness_'+del_date
                del_influence_key = del_date
                '''
                iter_count = 0
                for uid in uid_list:
                    try:
                        user_history_item = evaluate_history_results[iter_count]['_source']
                    except:
                        user_history_item = {}
                    try:
                        user_history_item.pop(del_activeness_key)
                        user_history_item.pop(del_influence_key)
                        user_history_item.pop(del_importance_key)
                    except:
                        pass
                    new_user_item = dict(user_history_item, **add_info[uid])
                    # yuankun-20151229
                    if add_info[uid][influence_key] < LOW_INFLUENCE_THRESHOULD: # update activity status; low-influence users are candidates for removal from the store
                        try:
                            new_user_item["low_number"] += 1
                        except:
                            new_user_item["low_number"] = 1
                    else:
                        new_user_item["low_number"] = 0
                    aver_activeness, aver_influence, aver_importance = average_value(new_user_item)
                    new_user_item['aver_activeness'] = aver_activeness
                    new_user_item['aver_influence'] = aver_influence
                    new_user_item['aver_importance'] = aver_importance
                    #print 'add_info:', add_info[uid]
                    #print 'user_history_item:', user_history_item
                    #print 'new_user_item:', new_user_item
                    action = {'index': {'_id': uid}}
                    #print 'action:', action
                    bulk_action.extend([action, new_user_item])
                    iter_count += 1 # fixed: the source never advanced iter_count in this loop
                es_user_portrait.bulk(bulk_action, index=copy_portrait_index_name, doc_type=copy_portrait_index_type)
                bulk_action = []
                add_info = {}
                iter_count = 0
                end_ts = time.time()
                print '%s sec count 1000' % (end_ts - start_ts)
        except StopIteration:
            print 'all done'
            if len(add_info) != 0:
                uid_list = add_info.keys()
                evaluate_history_results = es_user_portrait.mget(index=copy_portrait_index_name, doc_type=copy_portrait_index_type, body={'ids': uid_list})['docs']
                '''
                del_date = ts2datetime(time.time() - DAY*31)
                del_activeness_key = 'activeness_'+del_date
                del_influence_key = del_date
                '''
                iter_count = 0
                for uid in uid_list:
                    try:
                        user_history_item = evaluate_history_results[iter_count]['_source']
                    except:
                        user_history_item = {}
                    try:
                        user_history_item.pop(del_activeness_key)
                        user_history_item.pop(del_influence_key)
                        user_history_item.pop(del_importance_key)
                    except:
                        pass
                    new_user_item = dict(user_history_item, **add_info[uid])
                    if add_info[uid][influence_key] < LOW_INFLUENCE_THRESHOULD:
                        try:
                            new_user_item["low_number"] += 1
                        except:
                            new_user_item["low_number"] = 1
                    else:
                        new_user_item["low_number"] = 0
                    aver_activeness, aver_influence, aver_importance = average_value(new_user_item)
                    new_user_item['aver_activeness'] = aver_activeness
                    new_user_item['aver_influence'] = aver_influence
                    new_user_item['aver_importance'] = aver_importance
                    action = {'index': {'_id': uid}}
                    bulk_action.extend([action, new_user_item])
                    iter_count += 1
                es_user_portrait.bulk(bulk_action, index=copy_portrait_index_name, doc_type=copy_portrait_index_type)
                bulk_action = []
                add_info = {}
                iter_count = 0
            break
        except Exception, e:
            raise e
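scan_index_history stores each raw score normalized against a global maximum via normal_index, which is not defined in these snippets; a plausible sketch of that helper, purely an assumption about the scale:

# Hypothetical sketch (assumption): scale a raw score against the current
# maximum; the real normal_index may use a different range or rounding.
def normal_index(value, max_value):
    if not max_value:
        return 0
    return float(value) / max_value * 100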
def scan_retweet():
    count = 0
    scan_cursor = 0
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    #get redis db number
    db_number = get_db_num(now_date_ts)
    #retweet/be_retweet es mappings
    '''
    retweet_es_mappings(str(db_number))
    be_retweet_es_mappings(str(db_number))
    '''
    #get redis db
    retweet_redis = retweet_redis_dict[str(db_number)]
    retweet_bulk_action = []
    be_retweet_bulk_action = []
    start_ts = time.time()
    #retweet count/be_retweet count
    retweet_count = 0
    be_retweet_count = 0
    while True:
        re_scan = retweet_redis.scan(scan_cursor, count=100)
        re_scan_cursor = re_scan[0]
        '''
        if re_scan_cursor == 0:
            print 'scan finish'
            if retweet_bulk_action != []:
                es.bulk(retweet_bulk_action, index='retweet_'+str(db_number), doc_type='user')
            if be_retweet_bulk_action != []:
                es.bulk(be_retweet_bulk_action, index='be_retweet_'+str(db_number), doc_type='user')
            break
        '''
        for item in re_scan[1]:
            count += 1
            item_list = item.split('_')
            save_dict = {}
            if len(item_list) == 2:
                retweet_count += 1
                uid = item_list[1]
                item_result = retweet_redis.hgetall(item)
                save_dict['uid'] = uid
                save_dict['uid_retweet'] = json.dumps(item_result)
                retweet_bulk_action.extend([{'index': {'_id': uid}}, save_dict])
            elif len(item_list) == 3:
                be_retweet_count += 1
                uid = item_list[2]
                item_result = retweet_redis.hgetall(item)
                save_dict['uid'] = uid
                save_dict['uid_be_retweet'] = json.dumps(item_result)
                be_retweet_bulk_action.extend([{'index': {'_id': uid}}, save_dict])
        es.bulk(retweet_bulk_action, index='1225_retweet_' + str(db_number), doc_type='user')
        es.bulk(be_retweet_bulk_action, index='1225_be_retweet_' + str(db_number), doc_type='user')
        retweet_bulk_action = []
        be_retweet_bulk_action = []
        end_ts = time.time()
        print '%s sec scan %s count user:' % (end_ts - start_ts, count)
        start_ts = end_ts
        scan_cursor = re_scan[0]
        if scan_cursor == 0:
            break
    print 'count:', count
    print 'end'
def save_user_results(bulk_action):
    print 'save utils bulk action len:', len(bulk_action)
    #print 'bulk action:', bulk_action
    print es.bulk(bulk_action, index='user_portrait_1222', doc_type=index_type, timeout=600)
    return True
def scan_comment():
    count = 0
    scan_cursor = 0
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    # get redis db number
    db_number = get_db_num(now_date_ts)
    # comment/be_comment es mappings
    # get redis db
    comment_redis = comment_redis_dict[str(db_number)]
    # 1. check whether the db about to be written to already has data
    redis_host_list.remove(str(db_number))  # fixed: list.pop() takes an index, not a value; remove() matches the sibling snippets
    while 1:
        other_db_number = comment_redis_dict[redis_host_list[0]]
        current_dbsize = other_db_number.dbsize()
        if current_dbsize:
            break  # writing to the new db has begun, so the previous day's data is complete
        else:
            time.sleep(60)
    # 2. drop the previous es indices
    comment_es_mappings(str(db_number))
    be_comment_es_mappings(str(db_number))
    # 3. scan
    comment_bulk_action = []
    be_comment_bulk_action = []
    start_ts = time.time()
    # comment count/be_comment count
    comment_count = 0
    be_comment_count = 0
    while True:
        re_scan = comment_redis.scan(scan_cursor, count=100)
        re_scan_cursor = re_scan[0]
        for item in re_scan[1]:
            count += 1
            item_list = item.split("_")
            save_dict = {}
            if len(item_list) == 2:
                comment_count += 1
                uid = item_list[1]
                item_result = comment_redis.hgetall(item)
                save_dict["uid"] = uid
                save_dict["uid_comment"] = json.dumps(item_result)
                comment_bulk_action.extend([{"index": {"_id": uid}}, save_dict])
            elif len(item_list) == 3:
                be_comment_count += 1
                uid = item_list[2]
                item_result = comment_redis.hgetall(item)
                save_dict["uid"] = uid
                save_dict["uid_be_comment"] = json.dumps(item_result)
                be_comment_bulk_action.extend([{"index": {"_id": uid}}, save_dict])
        try:
            es.bulk(comment_bulk_action, index="1225_comment_" + str(db_number), doc_type="user")
        except:
            index_name = "1225_comment_" + str(db_number)
            split_bulk_action(comment_bulk_action, index_name)
        try:
            es.bulk(be_comment_bulk_action, index="1225_be_comment_" + str(db_number), doc_type="user")
        except:
            index_name = "1225_be_comment_" + str(db_number)
            split_bulk_action(be_comment_bulk_action, index_name)
        comment_bulk_action = []
        be_comment_bulk_action = []
        end_ts = time.time()
        # run_type
        if RUN_TYPE == 0:
            print "%s sec scan %s count user" % (end_ts - start_ts, count)
        start_ts = end_ts
        scan_cursor = re_scan[0]
        if scan_cursor == 0:
            break
    # 4. flush redis
    comment_redis.flushdb()
def save_user_results(bulk_action):
    print es_user_portrait.bulk(bulk_action, index=portrait_index_name, doc_type=portrait_index_type, timeout=60)
    return True
count_n = 0
search_list = []
user_list = []
while 1:
    try:
        scan_re = s_re.next()['_source']
        count += 1
        user_list.append(scan_re)
        if count % 1000 == 0:
            bulk_action, count_n, tb = co_search(es, user_list, bulk_action, count_n, tb)
            user_list = []
    except StopIteration:
        print "all done"
        bulk_action, count_n, tb = co_search(es, user_list, bulk_action, count_n, tb)
        break
    except Exception, r:
        print Exception, r
        sys.exit(0)
if bulk_action:
    es.bulk(bulk_action, index=index_destination, doc_type=index_destination_doctype, timeout=30)
print count, count_n # the two counts match only when everything was indexed
# 2. print the end-of-run log
now_ts = ts2datetime(time.time())
print_log = "&".join([file_path, "end", now_ts])
print print_log
def save_user_results(bulk_action):
    #print 'bulk_action:', bulk_action[0:2]
    es.bulk(bulk_action, index=index_name, doc_type=index_type)
    return True
def scan_comment():
    count = 0
    scan_cursor = 0
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    #get redis db number
    db_number = get_db_num(now_date_ts)
    #comment/be_comment es mappings
    #get redis db
    comment_redis = sensitive_comment_redis_dict[str(db_number)]
    # 1. check whether the db about to be written to already has data
    sensitive_redis_host_list.remove(str(db_number))
    while 1:
        # presumably the sensitive dict/list are meant here (the source mixed in
        # the non-sensitive comment_redis_dict/redis_host_list names)
        other_db_number = sensitive_comment_redis_dict[sensitive_redis_host_list[0]]
        current_dbsize = other_db_number.dbsize()
        if current_dbsize:
            break # writing to the new db has begun, so the previous day's data is complete
        else:
            time.sleep(60)
    # 2. drop the previous es indices
    comment_es_mappings(str(db_number))
    be_comment_es_mappings(str(db_number))
    # 3. scan
    comment_bulk_action = []
    be_comment_bulk_action = []
    start_ts = time.time()
    #comment count/be_comment count
    comment_count = 0
    be_comment_count = 0
    while True:
        re_scan = comment_redis.scan(scan_cursor, count=100)
        re_scan_cursor = re_scan[0]
        for item in re_scan[1]:
            count += 1
            item_list = item.split('_')
            save_dict = {}
            if len(item_list) == 2:
                comment_count += 1
                uid = item_list[1]
                item_result = comment_redis.hgetall(item)
                save_dict['uid'] = uid
                save_dict['uid_comment'] = json.dumps(item_result)
                comment_bulk_action.extend([{'index': {'_id': uid}}, save_dict])
            elif len(item_list) == 3:
                be_comment_count += 1
                uid = item_list[2]
                item_result = comment_redis.hgetall(item)
                save_dict['uid'] = uid
                save_dict['uid_be_comment'] = json.dumps(item_result)
                be_comment_bulk_action.extend([{'index': {'_id': uid}}, save_dict])
        #try:
        if comment_bulk_action:
            es.bulk(comment_bulk_action, index=sensitive_comment_index_name_pre + str(db_number), doc_type='user')
        #except:
        #    index_name = '1225_comment_'+str(db_number)
        #    split_bulk_action(comment_bulk_action, index_name)
        #try:
        if be_comment_bulk_action:
            es.bulk(be_comment_bulk_action, index=sensitive_be_comment_index_name_pre + str(db_number), doc_type='user')
        #except:
        #    index_name = '1225_be_comment_'+str(db_number)
        #    split_bulk_action(be_comment_bulk_action, index_name)
        comment_bulk_action = []
        be_comment_bulk_action = []
        end_ts = time.time()
        #run_type
        #if RUN_TYPE == 1:
        print '%s sec scan %s count user' % (end_ts - start_ts, count)
        start_ts = end_ts
        scan_cursor = re_scan[0]
        if scan_cursor == 0:
            break
    # 4. flush redis
    comment_redis.flushdb()