def split_bulk_action(bulk_action, index_name):
    """Send a bulk payload to Elasticsearch one (action, document) pair at a time.

    ``bulk_action`` is treated as a flat sequence in which even positions hold
    the action line and the following odd position holds its document. Each
    pair is submitted with its own ``es.bulk`` call so that one failing pair
    does not prevent the remaining pairs from being sent.

    Args:
        bulk_action: flat sequence ``[action, doc, action, doc, ...]``.
        index_name: name of the index passed to ``es.bulk``.

    Returns:
        None. Failures are reported on stdout and otherwise ignored
        (best-effort).
    """
    error_marker = 'cron/flow3/scan_redis2es_retweet.py&error-1&'
    # Only complete pairs can be sent; a trailing action with no document
    # used to raise IndexError outside the try block.
    paired_len = len(bulk_action) - len(bulk_action) % 2
    for start in range(0, paired_len, 2):
        pair = [bulk_action[start], bulk_action[start + 1]]
        try:
            es.bulk(pair, index=index_name, doc_type='user')
        except Exception:
            # Best-effort: report and keep going. ``Exception`` (not a bare
            # except) lets KeyboardInterrupt/SystemExit stop the job.
            print(error_marker)
    if paired_len != len(bulk_action):
        # Dangling action without a document: report it like any other
        # failed pair instead of crashing.
        print(error_marker)
def scan_retweet():
    """Copy the retweet hashes of one Redis db into Elasticsearch.

    Steps:
      1. Select the Redis db via ``get_db_num`` for the current date
         timestamp.
      2. When ``RUN_TYPE`` is truthy, poll every 60 seconds until another
         Redis db reports a non-zero ``dbsize``.
      3. Call ``retweet_es_mappings`` / ``be_retweet_es_mappings`` for the db.
      4. SCAN the db (``count=100``) and bulk-index every hash found. Keys
         that split on ``'_'`` into two parts are stored under
         ``'uid_retweet'``; keys with three parts under ``'uid_be_retweet'``.
         Keys of any other shape are counted but not indexed.
      5. When ``RUN_TYPE`` is truthy, flush the scanned Redis db.

    Returns:
        None.

    Raises:
        ValueError: if ``RUN_TYPE`` is truthy and the selected db key is not
            in ``redis_host_list``.
        Any exception raised by the Redis or Elasticsearch clients is
        propagated, which also means the Redis db is not flushed after a
        failed bulk request.
    """
    count = 0
    scan_cursor = 0
    now_date_ts = datetime2ts(ts2datetime(time.time()))
    db_key = str(get_db_num(now_date_ts))
    retweet_redis = retweet_redis_dict[db_key]

    # 1. Check whether the db we are about to switch to already holds data.
    print('%s %s' % (redis_host_list, db_key))
    if RUN_TYPE:
        # Work on a copy: removing from the shared module-level list would
        # make a second call in the same process fail or pick a wrong host.
        other_hosts = list(redis_host_list)
        other_hosts.remove(db_key)
        other_redis = retweet_redis_dict[other_hosts[0]]
        # Writes have started on the other db, so (per the original note)
        # the previous day's data is complete.
        while not other_redis.dbsize():
            time.sleep(60)

    # 2. Reset the previous ES data for this db (original note: "delete the
    #    previous es"; the helpers are defined elsewhere).
    retweet_es_mappings(db_key)
    be_retweet_es_mappings(db_key)

    # 3. Scan Redis and index one batch per SCAN round.
    retweet_index = retweet_index_name_pre + db_key
    be_retweet_index = be_retweet_index_name_pre + db_key
    start_ts = time.time()
    while True:
        scan_cursor, keys = retweet_redis.scan(scan_cursor, count=100)
        retweet_bulk_action = []
        be_retweet_bulk_action = []
        for item in keys:
            count += 1
            item_list = item.split('_')
            if len(item_list) == 2:
                uid = item_list[1]
                field = 'uid_retweet'
                target = retweet_bulk_action
            elif len(item_list) == 3:
                uid = item_list[2]
                field = 'uid_be_retweet'
                target = be_retweet_bulk_action
            else:
                # Unrecognised key shape: counted, not indexed.
                continue
            save_dict = {
                'uid': uid,
                field: json.dumps(retweet_redis.hgetall(item)),
            }
            target.extend([{'index': {'_id': uid}}, save_dict])

        if retweet_bulk_action:
            es.bulk(retweet_bulk_action, index=retweet_index, doc_type='user')
        if be_retweet_bulk_action:
            es.bulk(be_retweet_bulk_action, index=be_retweet_index,
                    doc_type='user')

        end_ts = time.time()
        print('%s sec scan %s count user:' % (end_ts - start_ts, count))
        start_ts = end_ts
        # SCAN returns cursor 0 once the whole keyspace has been visited.
        if scan_cursor == 0:
            break

    # 4. Flush the scanned Redis db.
    if RUN_TYPE:
        retweet_redis.flushdb()