def followers_rank(top_n, date, window_size):
    """Return the ``_id`` of up to ``top_n`` users ranked by follower count.

    The module-level ``user_search`` handle is queried for users whose
    ``followers_count`` exceeds ``FOLLOWERS_MIN_SUPPORT``, sorted by
    ``'-followers_count'`` (presumably descending -- confirm against the
    search backend).  The total hit count reported by the search is printed
    before the results are consumed.

    Args:
        top_n: forwarded to the search as ``max_offset``.
        date: unused by this function; kept for interface compatibility.
        window_size: unused by this function; kept for interface
            compatibility.

    Returns:
        A list of ``_id`` values in the order the search yields them.
    """
    # NOTE(review): ``user_search`` is expected to be defined at module level;
    # a previously commented-out line here built it as
    # XapianSearch(path='/opt/xapian_weibo/data/',
    #              name='master_timeline_user', schema_version=1).
    total, fetch_users = user_search.search(
        query={'followers_count': {'$gt': FOLLOWERS_MIN_SUPPORT}},
        sort_by=['-followers_count'],
        fields=['_id'],
        max_offset=top_n)
    print(total)
    return [entry['_id'] for entry in fetch_users()]
def make(date):
    """Compute per-user "important" scores for one day and store them in LevelDB.

    The window is the 24 hours ending at ``datetime2ts(date)``; statuses whose
    timestamp lies strictly between the two bounds and whose ``reposts_count``
    is greater than 500 are considered.  Each such status contributes
    ``0.9 * reposts_count + 0.1 * followers_count`` to a running total for its
    author, and the running total is queued into a write batch under the
    author's uid (key and value are both ``str()``-converted).  The batch is
    written synchronously once all statuses have been processed.

    Progress and every (uid, score) pair are printed to stdout.

    Args:
        date: passed unchanged to ``datetime2ts`` to obtain the end timestamp.

    Returns:
        None.
    """
    end_ts = datetime2ts(date)
    start_ts = end_ts - 24 * 60 * 60

    # NOTE(review): 'impotant' looks like a misspelling of 'important', but it
    # determines the database name, so it is left untouched here.
    db_name = get_leveldb('impotant', end_ts)
    daily_user_important_bucket = leveldb.LevelDB(
        os.path.join(LEVELDBPATH, db_name),
        block_cache_size=8 * (2 << 25),
        write_buffer_size=8 * (2 << 25))
    batch = leveldb.WriteBatch()

    query_dict = {
        'timestamp': {'$gt': start_ts, '$lt': end_ts},
        'reposts_count': {'$gt': 500},
    }
    # The keyword is ``fields`` (as in every other search call in this file);
    # the previous ``field=`` spelling did not match it.
    statuses_count, get_statuses_results = statuses_search.search(
        query=query_dict, fields=['user', '_id', 'reposts_count'])

    # Single-argument ``print(...)`` produces the same output as the Python 2
    # print statement and also parses on Python 3.
    print('total statuses: %s' % statuses_count)
    print('writing to levelDB %s...' % db_name)

    count = 0
    uid_important = {}
    for status in get_statuses_results():
        if count % 10000 == 0:
            print('current count: %s' % count)

        uid = status['user']
        reposts_count = status['reposts_count']

        # Followers default to 0 unless exactly one user record matches.
        followers_count = 0
        user_count, get_user_results = user_search.search(query={'_id': uid})
        if user_count == 1:
            for user in get_user_results():
                # A missing/None follower count is treated as 0 so the score
                # arithmetic below cannot fail on it.
                followers_count = user['followers_count'] or 0

        # Accumulate across all qualifying statuses of the same author; the
        # latest Put for a uid carries the running total.
        important = (0.9 * reposts_count + 0.1 * followers_count
                     + uid_important.get(uid, 0))
        uid_important[uid] = important

        print('%s %s' % (uid, important))
        batch.Put(str(uid), str(important))
        count += 1

    daily_user_important_bucket.Write(batch, sync=True)
    print('done.')
def get_superior_userid(weibo):
    """Return the user id of the direct superior (immediate upstream) of a weibo.

    The text is scanned for ``//@name:`` repost markers.

    * No marker found: the direct superior is the source of the repost, so
      ``weibo['retweeted_uid']`` is returned when it is truthy, else ``None``.
    * Marker found: the first marker's name is looked up through the
      module-level ``user_search`` and the matching user's ``_id`` is
      returned, or ``None`` when nothing (or only a falsy id) is found.

    Args:
        weibo: mapping with a ``'text'`` entry (unicode, or UTF-8 encoded
            bytes) and, when the text has no repost marker, a
            ``'retweeted_uid'`` entry.

    Returns:
        The superior's user id, or ``None`` if it cannot be determined.
    """
    # NOTE(review): another definition named ``get_superior_userid`` appears
    # later in this file and replaces this one at import time.
    text = weibo['text']
    # ``bytes`` is ``str`` on Python 2, so this is the same check there while
    # also being correct on Python 3.
    if isinstance(text, bytes):
        text = text.decode('utf-8', 'ignore')

    RE = re.compile(u'//@([a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+):', re.UNICODE)
    repost_chains = RE.findall(text)

    if not repost_chains:
        # The direct superior is the source node of the repost; in that case
        # the weibo text contains nothing the regular expression matches.
        retweeted_uid = weibo['retweeted_uid']
        return retweeted_uid if retweeted_uid else None

    direct_superior_name = repost_chains[0]
    count, results = user_search.search(
        query={'name': direct_superior_name}, fields=['_id', 'name'])

    # Initialised up front so a non-zero count with an empty result iterator
    # cannot leave the variable unbound.
    direct_superior_id = None
    if count != 0:
        # When several users match, the last one yielded wins.
        for result in results():
            direct_superior_id = result['_id']

    # Falsy ids (0, '', ...) are normalised to None.
    return direct_superior_id if direct_superior_id else None
def get_superior_userid(weibo):
    """Return the user id of the direct superior (immediate upstream) of a weibo.

    The text is scanned for ``//@name:`` repost markers.

    * No marker found: the direct superior is the source of the repost, so
      ``weibo['retweeted_uid']`` is returned when it is truthy, else ``None``.
    * Marker found: the first marker's name is looked up through the
      module-level ``user_search`` and the matching user's ``_id`` is
      returned, or ``None`` when nothing (or only a falsy id) is found.

    Args:
        weibo: mapping with a ``'text'`` entry (unicode, or UTF-8 encoded
            bytes) and, when the text has no repost marker, a
            ``'retweeted_uid'`` entry.

    Returns:
        The superior's user id, or ``None`` if it cannot be determined.
    """
    # NOTE(review): an earlier definition with the same name exists in this
    # file; this later one is the definition bound at import time.
    text = weibo['text']
    # ``bytes`` is ``str`` on Python 2, so this is the same check there while
    # also being correct on Python 3.
    if isinstance(text, bytes):
        text = text.decode('utf-8', 'ignore')

    RE = re.compile(u'//@([a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+):', re.UNICODE)
    repost_chains = RE.findall(text)

    if not repost_chains:
        # The direct superior is the source node of the repost; in that case
        # the weibo text contains nothing the regular expression matches.
        retweeted_uid = weibo['retweeted_uid']
        return retweeted_uid if retweeted_uid else None

    direct_superior_name = repost_chains[0]
    count, results = user_search.search(
        query={'name': direct_superior_name}, fields=['_id', 'name'])

    # Initialised up front so a non-zero count with an empty result iterator
    # cannot leave the variable unbound.
    direct_superior_id = None
    if count != 0:
        # When several users match, the last one yielded wins.
        for result in results():
            direct_superior_id = result['_id']

    # Falsy ids (0, '', ...) are normalised to None.
    return direct_superior_id if direct_superior_id else None
def get_user(uid):
    """Look up a user by ``_id`` and return a normalised profile dict.

    Only the first record yielded by ``user_search`` is used.  Falsy
    ``friends_count`` / ``followers_count`` / ``statuses_count`` values are
    replaced by 0, a falsy ``name`` is replaced by a placeholder string, and
    ``bi_followers_count`` / ``verified_reason`` are always the literal
    string ``'None'``.

    Args:
        uid: value matched against the ``_id`` field of the user index.

    Returns:
        The profile dict, or ``None`` when the search yields no record.
    """
    _, fetch_rows = user_search.search(query={'_id': uid})
    for row in fetch_rows():
        # Return on the first hit; any further records are ignored.
        return {
            'id': row['_id'],
            'province': row['province'],
            'bi_followers_count': 'None',
            'verified': row['verified'],
            'description': row['description'],
            'friends_count': row['friends_count'] or 0,
            'city': row['city'],
            'gender': row['gender'],
            'profile_image_url': row['profile_image_url'],
            'verified_reason': 'None',
            'followers_count': row['followers_count'] or 0,
            'location': row['location'],
            'statuses_count': row['statuses_count'] or 0,
            # Placeholder literal means "unknown user".
            'name': row['name'] or u'未知用户',
        }
    return None