def update_domain2leveldb():
    # Refresh each user's domain assignment in daily_identify_aifd_bucket
    # from domain_leveldb.
    # Benchmark: ~0.15 seconds per 10000 users, 22670000 users in total, ~0.09 h.
    count = 0
    ts = te = time.time()
    for k, v in domain_leveldb.RangeIter():
        uid, datestr = k.split('_')
        domainid = DOMAIN_LIST.index(v)
        try:
            # Keep the existing active/important/follower values; the stored
            # domain field is replaced by the freshly computed one below.
            active, important, follower, _domain = daily_identify_aifd_bucket.Get(str(uid)).split('_')
        except KeyError:
            active = 0
            important = 0
            follower = 0
        domain = domainid
        daily_identify_aifd_bucket.Put(str(uid), str(active) + '_' + str(important) + '_' +
                                       str(follower) + '_' + str(domain))
        if count % 10000 == 0:
            te = time.time()
            print count, '%s sec' % (te - ts), ' identify person domain', now_datestr
            ts = te
        count += 1
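
# Illustrative sketch, not part of the original module: values written by
# update_domain2leveldb() are underscore-joined 'active_important_follower_domain'
# strings, so a stored record can be unpacked as shown here. The helper name
# parse_aifd_value is hypothetical.
def parse_aifd_value(value):
    # The four fields come back as strings, in the order they were written.
    active, important, follower, domain = value.split('_')
    return active, important, follower, domain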
def user2domainFromLeveldb(uid, updatetime='20131220'):
    # Look up a user's domain id in user_domain_bucket; -1 means unknown.
    try:
        domainstr = user_domain_bucket.Get(str(uid) + '_' + str(updatetime))
        domainid = DOMAIN_LIST.index(domainstr)
    except (KeyError, ValueError):
        domainid = -1
    return domainid
def user2domain(uid, updatetime='20131220'):
    # Look up a user's domain id in domain_leveldb; fall back to domain id 20
    # when the user is not found.
    try:
        v = domain_leveldb.Get(str(uid) + '_' + str(updatetime))
        domainid = DOMAIN_LIST.index(v)
    except KeyError:
        domainid = 20
    return domainid
def userLeveldb2Domain(uid, updatetime='20131220'):
    # Look up a user's domain id in spieduser_bucket; -1 means unknown.
    try:
        v = spieduser_bucket.Get(str(uid) + '_' + str(updatetime))
        domainid = DOMAIN_LIST.index(v)
    except KeyError:
        domainid = -1
    return domainid
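
# Usage sketch, not part of the original module: resolve one uid to a domain
# name via the lookup helpers above. DOMAIN_LIST and the leveldb buckets are
# assumed to be initialized at module scope; the uid default is a made-up
# example value.
def _example_lookup_domain(uid='1234567890', updatetime='20131220'):
    domainid = user2domain(uid, updatetime)
    if 0 <= domainid < len(DOMAIN_LIST):
        return DOMAIN_LIST[domainid]
    return None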
def sentiment_field(domain, xapian_search_weibo, start_ts, over_ts, sort_field='reposts_count',
                    save_fields=RESP_ITER_KEYS, during=Hour, w_limit=TOP_WEIBOS_LIMIT,
                    k_limit=TOP_KEYWORDS_LIMIT):
    # Per-interval sentiment statistics for one domain: for each emotion, count
    # matching weibos, extract top keywords and top weibos, then save the results.
    # domain_uids is expected to hold the uids belonging to `domain` (module scope).
    if domain_uids != []:
        start_ts = int(start_ts)
        over_ts = int(over_ts)
        over_ts = ts2HourlyTime(over_ts, during)
        interval = (over_ts - start_ts) / during
        for i in range(interval, 0, -1):
            emotions_count = {}
            emotions_kcount = {}
            emotions_weibo = {}
            begin_ts = over_ts - during * i
            end_ts = begin_ts + during
            print begin_ts, end_ts, 'domain %s starts calculate' % domain

            # Restrict the query to this time window and to the domain's users.
            query_dict = {
                'timestamp': {'$gt': begin_ts, '$lt': end_ts},
                '$or': []
            }
            for uid in domain_uids:
                query_dict['$or'].append({'user': uid})

            for k, v in emotions_kv.iteritems():
                query_dict['sentiment'] = v
                scount = xapian_search_weibo.search(query=query_dict, count_only=True)
                mset = xapian_search_weibo.search(query=query_dict, sort_by=[sort_field],
                                                  max_offset=w_limit, mset_direct=True)
                kcount = top_keywords(gen_mset_iter(xapian_search_weibo, mset, fields=['terms']), top=k_limit)
                top_ws = top_weibos(gen_mset_iter(xapian_search_weibo, mset, fields=save_fields), top=w_limit)
                emotions_count[v] = [end_ts, scount]
                emotions_kcount[v] = [end_ts, kcount]
                emotions_weibo[v] = [end_ts, top_ws]
                print k, v, ', emotions count: ', emotions_count, ', emotion keywords length: ', \
                    len(kcount), ', emotion weibos length: ', len(top_ws)

            print domain, ' %s %s saved emotions counts, keywords and weibos' % (begin_ts, end_ts)
            save_count_results(DOMAIN_LIST.index(domain), emotions_count, during)
            save_kcount_results(DOMAIN_LIST.index(domain), emotions_kcount, during, TOP_KEYWORDS_LIMIT)
            save_weibos_results(DOMAIN_LIST.index(domain), emotions_weibo, during, TOP_WEIBOS_LIMIT)
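
# Driver sketch, not part of the original module: how sentiment_field() could be
# invoked hour by hour for every domain over one day. The start timestamp is a
# made-up example value; xapian_search_weibo is assumed to be an already-opened
# search instance of the kind sentiment_field() expects, and Hour is the
# module-level interval length used above.
def _example_run_sentiment(xapian_search_weibo):
    start_ts = 1387497600            # made-up example start timestamp (seconds)
    over_ts = start_ts + 24 * Hour   # cover one day in hourly windows
    for domain in DOMAIN_LIST:
        sentiment_field(domain, xapian_search_weibo, start_ts, over_ts, during=Hour)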
def _add_domain_usersFromLeveldb(updatetime='20131220'):
    # Open spieduser_bucket and add every user to the redis set of its domain.
    try:
        spieduser_bucket = leveldb.LevelDB(os.path.join(LEVELDBPATH, 'spiedusers_4'),
                                           block_cache_size=8 * (2 << 25),
                                           write_buffer_size=8 * (2 << 25))
    except Exception:
        print 'spieduser_bucket leveldb not available now'
        return
    for k, v in spieduser_bucket.RangeIter():
        uid, updatetime = k.split('_')
        uid = int(uid)
        domain = str(v)
        domainid = int(DOMAIN_LIST.index(domain))
        r.sadd(DOMAIN_USERS % domainid, uid)
def _add_all_user_domain(r):
    '''Write every user's domain id into the USER_DOMAIN redis hash.

    Benchmark: ~10000 users per second.
    '''
    count = 0
    ts = te = time.time()
    for k, v in spieduser_bucket.RangeIter():
        uid, updatetime = k.split('_')
        uid = int(uid)
        domainid = DOMAIN_LIST.index(v)
        r.hset(USER_DOMAIN, uid, domainid)
        if count % 10000 == 0:
            te = time.time()
            print count, '%s sec' % (te - ts)
            ts = te
        count += 1
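
# Read-back sketch, not part of the original module: after _add_all_user_domain()
# and _add_domain_usersFromLeveldb() have run, the redis side can be queried as
# shown here. r, USER_DOMAIN, DOMAIN_USERS and DOMAIN_LIST are the module-level
# names used above; the uid default is a made-up example value.
def _example_read_redis(r, uid=1234567890):
    domainid = r.hget(USER_DOMAIN, uid)                  # domain id stored for one user
    if domainid is not None:
        members = r.scard(DOMAIN_USERS % int(domainid))  # size of that domain's user set
        return int(domainid), members
    return None, 0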