def update_follower2leveldb(): # 从leveldb更新leveldb的用户粉丝数数据 # test 0.15 seconds per 10000 users, total 22670000 users, 0.09 h users = xapian_search_user.iter_all_docs(fields=['user', 'followers_count']) count = 0 ts = te = time.time() for k, v in user_followers_count_leveldb.RangeIter(): uid = int(k) follower = int(v) try: active, important, _follower, domain = daily_identify_aifd_bucket.Get(str(uid)).split('_') except KeyError: active = 0 important = 0 domain = 20 daily_identify_aifd_bucket.Put(str(uid), str(active) + '_' + str(important) + '_' + \ str(follower) + '_' + str(domain)) if count % 10000 == 0: te = time.time() print count, '%s sec' % (te - ts), ' identify person follower', now_datestr ts = te count += 1
def iter_userbasic2leveldb(): users = xapian_search_user.iter_all_docs(fields=xapian_user_fields) count = 0 batch = leveldb.WriteBatch() ts = te = time.time() for user in users: if count % 10000 == 0: te = time.time() daily_profile_person_basic_db.Write(batch, sync=True) batch = leveldb.WriteBatch() print count, '%s sec' % (te - ts), 'xapian2leveldb person basic' ts = te # extraction and transfer try: userId = int(user['_id']) except: count += 1 continue province = user['province'] city = user['city'] verified = user['verified'] name = _utf_encode(user['name']) friendsCount = user['friends_count'] gender = user['gender'] profileImageUrl = user['profile_image_url'] verifiedType = user['verified_type'] followersCount = user['followers_count'] location = _utf_encode(user['location']) statusesCount = user['statuses_count'] description = _utf_encode(user['description']) domain = userLeveldb2DomainZh(userId) try: created_at = int(user['created_at']) except: count += 1 continue date = batch_date_1 #Load key = str(userId) value = '_\/'.join([str(province), str(city), str(verified), \ str(name), str(friendsCount), str(gender), \ str(profileImageUrl), str(verifiedType), \ str(followersCount), str(location), \ str(statusesCount), str(description), \ str(created_at), str(domain)]) batch.Put(key, value) count += 1
def user_name_uid_xapian2redis(): ''' test 2 secondes per 10000 ''' count = 0 ts = te = time.time() users = xapian_search_user.iter_all_docs(fields=['name', '_id']) for user in users: name = user['name'] uid = user['_id'] global_r0.hset(USER_NAME_UID, name, int(uid)) count += 1 if count % 10000 == 0: te = time.time() print count, '%s sec' % (te - ts) ts = te
def follower_leveldb(): # test 10 thousand per second get_results = xapian_search_user.iter_all_docs(fields=['_id', 'followers_count']) count = 0 ts = te = time.time() for result in get_results: field_daily_active_count_bucket.Put(str(result['_id']), str(result['followers_count'])) if count % 10000 == 0: te = time.time() print count, '%s sec' % (te - ts), 'identify followers_count to leveldb ' ts = te count = count + 1 return 'Done'
def batch_handle_domain_basic(): count = 0 ts = te = time.time() users = xapian_search_user.iter_all_docs(fields=['_id', 'verified', 'location']) for user in users: if count % 10000 == 0: te = time.time() print count, '%s sec' % (te - ts), ' %s daily domain basic' % batch_date_1 ts = te domainid = userLeveldb2Domain(user['_id']) verified = user['verified'] province_str = user['location'].split(' ')[0] try: verified_count, unverified_count, province_dict = daily_profile_domain_basic_db.Get(str(domainid)).split('_\/') verified_count = int(verified_count) unverified_count = int(unverified_count) province_dict = json.loads(province_dict) except KeyError: verified_count = unverified_count = 0 province_dict = {} if verified: verified_count += 1 else: unverified_count += 1 try: province_dict[province_str] += 1 except KeyError: province_dict[province_str] = 1 key = str(domainid) value = '_\/'.join([str(verified_count), str(unverified_count), json.dumps(province_dict)]) daily_profile_domain_basic_db.Put(key, value) count += 1
def calFieldByFriends(): protousers = readProtoUser() iter_count = 0 ts = te = time.time() users = xapian_search_user.iter_all_docs(fields=['friends']) for user in users: area_dict = {} friends = user['friends'] for fri in friends: try: area = protousers[fri] area_dict[area] += 1 except KeyError: pass if area_dict != {}: area_counts = sorted(area_dict.iteritems(), key=itemgetter(1), reverse=True) if len(area_counts) == 1: areas = area_counts[0][0] else: areas = area_counts[0][0] + ',' + area_counts[1][0] try: e_areas = global_user_field_bucket.Get(str(uid) + '_' + update_datestr) areas = ','.join(e_areas.split(',') + areas.split(',')) except KeyError: pass global_user_field_bucket.Put(str(uid) + '_' + update_datestr, areas) if iter_count % 10000 == 0: te = time.time() print iter_count, '%s sec' % (te - ts) ts = te iter_count += 1
# -*- coding: utf-8 -*- import os import time import leveldb from config import xapian_search_user, LEVELDBPATH user_name_bucket = leveldb.LevelDB(os.path.join(LEVELDBPATH, 'linhao_user_name'), block_cache_size=8 * (2 << 25), write_buffer_size=8 * (2 << 25)) users = xapian_search_user.iter_all_docs(fields=['user', 'name']) count = 0 ts = te = time.time() for user in users: uid = user['user'] name = user['name'] user_name_bucket.Put(str(name.encode('utf-8')), str(uid)) if count % 10000 == 0: te = time.time() print count, '%s sec' % (te - ts), ' user name to leveldb' ts = te count += 1