def whole_followers_task(top_n, date, window_size):
    # return the uids of the top_n users ranked by follower count
    user_search = XapianSearch(path='/opt/xapian_weibo/data/', name='master_timeline_user', schema_version=1)
    count, get_results = user_search.search(query={'followers_count': {'$gt': FOLLOWERS_MIN_SUPPORT}},
                                            sort_by=['-followers_count'], fields=['_id'], max_offset=top_n)
    sorted_uids = []
    for user in get_results():
        sorted_uids.append(user['_id'])
    return sorted_uids
def load_friendship_to_leveldb():
    # dump each user's friends/followers id lists into LevelDB as JSON,
    # keyed by '<uid>_friends' and '<uid>_followers'
    from xapian_weibo.xapian_backend import XapianSearch
    s_user = XapianSearch(path='/opt/xapian_weibo/data/', name='master_timeline_user', schema_version=1)
    query_dict = {
        '_id': {
            '$gt': 0,
        }
    }
    count, get_results = s_user.search(query=query_dict, fields=['_id', 'friends', 'followers'])
    print count
    count = 0
    ts = te = time.time()
    for r in get_results():
        uid = r['_id']
        friends = r['friends']
        followers = r['followers']
        if friends and len(friends):
            k = str(uid) + '_' + 'friends'
            v = json.dumps(friends)
            friendship_bucket.Put(k, str(v))
        if followers and len(followers):
            k = str(uid) + '_' + 'followers'
            v = json.dumps(followers)
            friendship_bucket.Put(k, str(v))
        count += 1
        if count % 10000 == 0:
            te = time.time()
            print count, '%s sec' % (te - ts)
            ts = te
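# A minimal read-back sketch, not part of the original function: it assumes the same
# friendship_bucket LevelDB handle and json module used above; the uid is hypothetical.
def get_friends_from_leveldb(uid):
    # friends were stored as a JSON-encoded id list under '<uid>_friends'
    try:
        return json.loads(friendship_bucket.Get(str(uid) + '_friends'))
    except KeyError:
        return []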
def get_user(uid):
    # fetch a single user's profile from the xapian user index; return None if not found
    user = {}
    s = XapianSearch(path='/opt/xapian_weibo/data/', name='master_timeline_user')
    count, get_results = s.search(query={'_id': uid})
    for r in get_results():
        user['id'] = r['_id']
        user['province'] = r['province']
        user['bi_followers_count'] = r['bi_followers_count']
        user['verified'] = r['verified']
        user['description'] = r['description'].decode("utf-8")
        user['friends_count'] = r['friends_count']
        user['city'] = r['city']
        user['gender'] = r['gender']
        user['profile_image_url'] = r['profile_image_url']
        user['verified_reason'] = r['verified_reason'].decode("utf-8")
        user['followers_count'] = r['followers_count']
        user['location'] = r['location'].decode("utf-8")
        user['active'] = r['active']
        user['statuses_count'] = r['statuses_count']
        if r['name']:
            user['name'] = r['name'].decode("utf-8")
        else:
            user['name'] = u'未知用户'  # "unknown user"
        user['userField'] = u'未知领域'  # "unknown field"
        break
    if user == {}:
        return None
    else:
        return user
def make_network(topic, date, window_size, max_size=100000, ts=False):
    # build a directed repost network for a topic within the given time window
    end_time = datetime2ts(date)
    start_time = end_time - window2time(window_size)
    g = nx.DiGraph()

    # needs the repost (retweeted_status) index
    topic = cut(s, topic.encode('utf-8'))
    statuses_search = XapianSearch(path='/opt/xapian_weibo/data/', name='master_timeline_weibo', schema_version=2)
    query_dict = {'text': topic, 'timestamp': {'$gt': start_time, '$lt': end_time}}

    if ts:
        count, get_statuses_results = statuses_search.search(query=query_dict,
                                                             fields=['text', 'user', 'timestamp', 'retweeted_status'],
                                                             max_offset=max_size)
    else:
        count, get_statuses_results = statuses_search.search(query=query_dict,
                                                             fields=['text', 'user', 'retweeted_status'],
                                                             max_offset=max_size)
    print 'topic statuses count %s' % count

    if ts:
        uid_ts = {}
        for status in get_statuses_results():
            try:
                if status['retweeted_status']:
                    repost_uid = status['user']
                    rt_mid = status['retweeted_status']
                    repost_ts = int(status['timestamp'])
                    source_status = acquire_status_by_id(rt_mid)
                    source_uid = source_status['user']
                    source_ts = int(source_status['timestamp'])
                    if is_in_trash_list(repost_uid) or is_in_trash_list(source_uid):
                        continue
                    # keep the earliest timestamp seen for each uid
                    if repost_uid not in uid_ts:
                        uid_ts[repost_uid] = repost_ts
                    elif uid_ts[repost_uid] > repost_ts:
                        uid_ts[repost_uid] = repost_ts
                    if source_uid not in uid_ts:
                        uid_ts[source_uid] = source_ts
                    elif uid_ts[source_uid] > source_ts:
                        uid_ts[source_uid] = source_ts
                    g.add_edge(repost_uid, source_uid)
            except (TypeError, KeyError):
                continue
        return uid_ts, g
    else:
        for status in get_statuses_results():
            try:
                if status['retweeted_status']:
                    repost_uid = status['user']
                    rt_mid = status['retweeted_status']
                    source_uid = acquire_status_by_id(rt_mid)['user']
                    if is_in_trash_list(repost_uid) or is_in_trash_list(source_uid):
                        continue
                    g.add_edge(repost_uid, source_uid)
            except (TypeError, KeyError):
                continue
        return g
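# A minimal usage sketch, not part of the original module: it assumes the module-level
# helpers used above (cut, datetime2ts, window2time, acquire_status_by_id,
# is_in_trash_list) are available, and the topic/date/window values are hypothetical.
if __name__ == '__main__':
    repost_graph = make_network(u'some topic', '2013-01-01', window_size=24)
    # rank users in the repost network; PageRank is just one reasonable choice here
    pr = nx.pagerank(repost_graph)
    for uid, score in sorted(pr.items(), key=lambda x: x[1], reverse=True)[:10]:
        print uid, score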
def search_test(date):
    start_ts = datetime2ts(date)
    end_ts = start_ts + 24 * 60 * 60
    statuses_search = XapianSearch(path=XAPIAN_STATUSES_PATH, name='master_timeline_weibo', schema_version=2)
    query_dict = {'timestamp': {'$gt': start_ts, '$lt': end_ts}}
    statuses_count, get_statuses_results = statuses_search.search(query=query_dict,
                                                                  fields=['user', '_id', 'retweeted_status'])
    count = 0
    start_time = time.time()
    for status in get_statuses_results():
        count += 1
    print 'total statuses %s' % count
def search_single():
    search_start_ts = time.time()
    xapian_weibo = XapianSearch(stub=stub_files, schema_version=5)
    count, get_results = xapian_weibo.search(query={"text": keywords_arg}, fields=xapian_fields)
    count = 0
    search_end_ts = time.time()
    print "search single %d" % (search_end_ts - search_start_ts)
    tb = time.time()
    ts = tb
    for r in get_results():
        if count % 10000 == 0:
            te = time.time()
            print "[%s] read speed: %s sec/per %s" % (datetime.now().strftime("%Y-%m-%d %H:%M:%S"), te - ts, 10000)
            ts = te
        count += 1
    print count
def make(date, hour):
    # accumulate per-user repost counts for one hour of statuses into a LevelDB bucket
    ts = datetime2ts(date)
    start_ts = datetime2ts(date) + (hour - 1) * 60 * 60
    end_ts = start_ts + 60 * 60  # one-hour window
    db_name = get_leveldb(ts, hour)
    hourly_user_burst_bucket = leveldb.LevelDB(os.path.join(LEVELDBPATH, db_name),
                                               block_cache_size=8 * (2 << 25),
                                               write_buffer_size=8 * (2 << 25))
    statuses_search = XapianSearch(path='/opt/xapian_weibo/data/', name='master_timeline_weibo', schema_version=2)
    user_search = XapianSearch(path='/opt/xapian_weibo/data/', name='master_timeline_user', schema_version=1)
    batch = leveldb.WriteBatch()
    query_dict = {'timestamp': {'$gt': start_ts, '$lt': end_ts}, 'reposts_count': {'$gt': 100}}
    statuses_count, get_statuses_results = statuses_search.search(query=query_dict,
                                                                  fields=['user', '_id', 'reposts_count'])
    print 'total statuses: %s' % statuses_count
    print 'writing to levelDB %s...' % db_name
    batch.Put('size', str(statuses_count))
    count = 0
    uid_burst = {}
    for status in get_statuses_results():
        if count % 10000 == 0:
            print 'current count: %s' % count
        uid = status['user']
        reposts_count = status['reposts_count']
        followers_count = 0
        if uid not in uid_burst:
            uid_burst[uid] = 0
        reposts_count += uid_burst[uid]
        uid_burst[uid] = reposts_count
        batch.Put(str(uid), str(reposts_count))
        count += 1
    hourly_user_burst_bucket.Write(batch, sync=True)
    print 'done.'
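# A minimal read-back sketch, not part of the original script: it reuses the
# get_leveldb/datetime2ts helpers and LEVELDBPATH used above; date, hour and uid
# are hypothetical.
def get_user_burst(date, hour, uid):
    db_name = get_leveldb(datetime2ts(date), hour)
    bucket = leveldb.LevelDB(os.path.join(LEVELDBPATH, db_name))
    try:
        return int(bucket.Get(str(uid)))
    except KeyError:
        return 0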
def test_search(stub_file):
    search_start_ts = time.time()
    xapian_weibo = XapianSearch(stub=stub_file, schema_version=5)
    count, get_results = xapian_weibo.search(query={"text": keywords_arg}, fields=xapian_fields)
    pid_num = os.getpid()
    search_end_ts = time.time()
    print "Working in Process #%d, %d, search uses %d seconds" % (pid_num, count, search_end_ts - search_start_ts)
    fw = open("./cache/%s.txt" % pid_num, "w")
    count = 0
    tb = time.time()
    ts = tb
    for r in get_results():
        fw.write("%s\n" % json.dumps(r))
        if count % 10000 == 0:
            te = time.time()
            print "process [%s] write speed: %s sec/per %s" % (pid_num, te - ts, 10000)
            ts = te
        count += 1
    fw.close()
    return pid_num
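# A minimal driver sketch, not part of the original script: one way to run test_search
# over several Xapian stub files in parallel. The stub paths are hypothetical, and
# keywords_arg/xapian_fields are assumed to be defined at module level as above.
if __name__ == '__main__':
    from multiprocessing import Pool
    stub_files = ['./stub/stub_0', './stub/stub_1']  # hypothetical stub paths
    pool = Pool(len(stub_files))
    pids = pool.map(test_search, stub_files)
    pool.close()
    pool.join()
    print 'finished worker processes: %s' % pids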
class BenchXapianGetByIds(hurdles.BenchCase):
    def setUp(self):
        self.n = 10000
        self.s = XapianSearch(path='/home/arthas/dev/xapian_weibo/data/', name='master_timeline_weibo')
        self.weibo_ids = self._load_weibo_ids_from_xapian(self.n)

    def tearDown(self):
        pass

    def _load_weibo_ids_from_xapian(self, limit):
        begin_ts = time.mktime(datetime.datetime(2013, 1, 1).timetuple())
        end_ts = time.mktime(datetime.datetime(2013, 1, 2).timetuple())
        query_dict = {
            'timestamp': {'$gt': begin_ts, '$lt': end_ts},
        }
        count, get_results = self.s.search(query=query_dict, max_offset=limit, fields=['_id'])
        print count
        ids = []
        for r in get_results():
            ids.append(r['_id'])
        return ids

    def _bench_batch(self, size):
        # fetch documents in $or batches of `size` ids per query
        for i in xrange(self.n / size):
            query_dict = {'$or': []}
            for _id in self.weibo_ids[i * size:(i + 1) * size]:
                query_dict['$or'].append({'_id': _id})
            count, get_results = self.s.search(query=query_dict, fields=['_id', 'text'])

    def bench_1(self):
        for _id in self.weibo_ids:
            query_dict = {'_id': _id}
            count, get_results = self.s.search(query=query_dict, fields=['_id', 'text'])

    def bench_10(self):
        self._bench_batch(10)

    def bench_20(self):
        self._bench_batch(20)

    def bench_30(self):
        self._bench_batch(30)

    def bench_50(self):
        self._bench_batch(50)
        if not line:
            break
        else:
            record = line.split()
            swds.append(record[0])

classes = ['education', 'culture', 'fashion', 'entertainment', 'finance', 'media', 'sports', 'technology']
# cc = opencc.OpenCC('mix2s')  # traditional/simplified Chinese conversion

# generate seed users
# period
b = datetime.datetime(2012, 10, 1)
tb = time.mktime(b.timetuple())
e = datetime.datetime(2013, 10, 1)
te = time.mktime(e.timetuple())

s = XapianSearch(path='/opt/xapian_weibo/data/', name='master_timeline')   # search by index
s1 = XapianSearch(path='/opt/xapian_weibo/data/', name='master_timeline')  # search for original tweets

total_uid_set = set()
u_seed = []   # seed users
u_cls = {}    # user -> class mapping
w_user = {}   # word segmentation per user
n_s = 0       # number of seed users for each class
sw_cls = {}   # seed-user word statistics for each class
sw_cls = cinitialize(classes, sw_cls, 2)
sw = {}       # words from all seed users
wp_cls = {}   # proto-words for each class
wp_cls = cinitialize(classes, wp_cls, 1)
Nswds = 0
exception = 0
for area in classes:
from operator import itemgetter, attrgetter
import pymongo
import datetime
import time
import sys
import leveldb
import os

from xapian_weibo.xapian_backend import XapianSearch

LEVELDBPATH = '/home/mirage/leveldb'
global_user_field_bucket = leveldb.LevelDB(os.path.join(LEVELDBPATH, 'linhao_global_user_field_20131012'),
                                           block_cache_size=8 * (2 << 25),
                                           write_buffer_size=8 * (2 << 25))

xapian_search_weibo = XapianSearch(path='/opt/xapian_weibo/data/', name='master_timeline_weibo', schema_version=2)  # search by index
xapian_search_user = XapianSearch(path='/opt/xapian_weibo/data/', name='master_timeline_user', schema_version=1)    # search by index

mbr = {'culture': 0, 'entertainment': 0, 'fashion': 0, 'education': 0, 'finance': 0, 'sports': 0, 'technology': 0, 'media': 0}
fields_value = ['culture', 'education', 'entertainment', 'fashion', 'finance', 'media', 'sports', 'technology']


def readProtoUser():
    # read proto (seed) users per field from a 'field: uid uid ...' text file
    f = open("/home/mirage/linhao/project_bishe/weibo/profile/user_classify/protou.txt", "r")
    protou = {}
    for line in f:
        area = line.split(":")[0]
        if area not in protou:
            protou[area] = set()
        for u in (line.split(":")[1]).split():
            protou[area].add(int(u))
    f.close()
    return protou
# -*- coding: utf-8 -*-
from xapian_weibo.xapian_backend import XapianSearch
import leveldb
import datetime
import time
import os

s = XapianSearch(path='/opt/xapian_weibo/data/', name='master_timeline')

LEVELDBPATH = '/home/mirage/leveldb'
weibo_multi_sentiment_bucket = leveldb.LevelDB(os.path.join(LEVELDBPATH, 'huyue_weibo_multi_sentiment'),
                                               block_cache_size=8 * (2 << 25),
                                               write_buffer_size=8 * (2 << 25))
user_daily_sentiment_count_bucket = leveldb.LevelDB(os.path.join(LEVELDBPATH, 'linhao_user_daily_sentiment_count'),
                                                    block_cache_size=8 * (2 << 25),
                                                    write_buffer_size=8 * (2 << 25))

emotions_kv = {'happy': 1, 'angry': 2, 'sad': 3}
total_days = 89

today = datetime.datetime.today()
now_ts = time.mktime(datetime.datetime(today.year, today.month, today.day, 2, 0).timetuple())
now_ts = int(now_ts)
during = 24 * 3600
begin_ts = now_ts - total_days * during

query_dict = {
    'timestamp': {'$gt': begin_ts, '$lt': now_ts}
}

count, get_results = s.search(query=query_dict, fields=['user', 'id', 'timestamp'])
print count
    tks = [token for token in s.participle(cut_filter(text))
           if 3 < len(token[0]) < 30 or token[0] in single_word_whitelist]
    if cx:
        return tks
    else:
        return [tk[0] for tk in tks]
# -- end of word-segmentation helpers --

# load xapian to read users' verification type
XAPIAN_USER_DATA_PATH = '/home/xapian/xapian_user/'
xs = XapianSearch(path=XAPIAN_USER_DATA_PATH, name='master_timeline_user', schema_version=1)


def read_by_xapian(xs, uid):
    # look up the user's background information in xapian by uid
    count, get_results = xs.search(query={'_id': uid})
    if count:
        for r in get_results():
            return r
    else:
        return 'other'
# -- end of xapian loading --
class BenchXapianR(hurdles.BenchCase):
    def setUp(self):
        self.weibo_x = XapianSearch(path='/home/arthas/dev/xapian_weibo/data/', name='master_timeline_weibo')
        self.user_x = XapianSearch(path='/home/arthas/dev/xapian_weibo/data/', name='master_timeline_user', schema_version=1)
        self.begin_ts = time.mktime(datetime.datetime(2011, 12, 1).timetuple())
        self.end_ts = time.mktime(datetime.datetime(2011, 12, 31).timetuple())

    def tearDown(self):
        pass

    """
    def bench_load_users(self):
        query_dict = {
            'created_at': {
                '$gt': self.begin_ts,
                '$lt': self.end_ts,
            }
        }
        count, get_results = self.user_x.search(query=query_dict, fields=['_id', 'name'])
        print count

    def bench_load_users_then_sort(self):
        query_dict = {
            'created_at': {
                '$gt': self.begin_ts,
                '$lt': self.end_ts,
            }
        }
        count, get_results = self.user_x.search(query=query_dict, fields=['_id', 'name'], sort_by=['created_at'])
        print count

    def bench_load_weibos(self):
        query_dict = {
            'timestamp': {
                '$gt': self.begin_ts,
                '$lt': self.end_ts,
            }
        }
        count, get_results = self.weibo_x.search(query=query_dict, fields=['_id', 'user'])
        print count
    """

    def bench_get_results_weibos(self):
        query_dict = {
            'timestamp': {'$gt': self.begin_ts, '$lt': self.end_ts}
        }
        _, get_results = self.weibo_x.search(query=query_dict, fields=['_id', 'user'])
        for r in get_results():
            _id = r['_id']

    def bench_get_results_users(self, *args, **kwargs):
        query_dict = {
            'created_at': {'$gt': self.begin_ts, '$lt': self.end_ts}
        }
        _, get_results = self.user_x.search(query=query_dict, fields=['_id', 'name'])
        for r in get_results():
            _id = r['_id']
    """
# -*- coding:utf-8 -*-
import time
import datetime

from xapian_weibo.xapian_backend import XapianSearch

s = XapianSearch(path='/opt/xapian_weibo/data/', name='master_timeline_user', schema_version=1)

begin_ts = time.mktime(datetime.datetime(2011, 1, 1).timetuple())
end_ts = time.mktime(datetime.datetime(2011, 12, 31).timetuple())

"""
query_dict = {
    'created_at': {
        '$gt': begin_ts,
        '$lt': end_ts,
    }
}

count, get_results = s.search(query=query_dict, max_offset=1, fields=['_id', 'name'], sort_by=['created_at'])
print count
for r in get_results():
    print r['_id'], r['name']
"""

"""
query_dict = {
    '$or': [
        {'_id': 1934744637},
# -*- coding: utf-8 -*-
from xapian_weibo.xapian_backend import XapianSearch

'''
search_weibo = XapianSearch(path='/opt/xapian_weibo/data/', name='master_timeline_weibo', schema_version=2)
print search_weibo.search(query={'_id': {'gt': 0, 'lt': 30000000000000000000000000}}, count_only=True)

search_user = XapianSearch(path='/opt/xapian_weibo/data/', name='master_timeline_user', schema_version=1)
print search_user.search(query={'_id': {'gt': 0, 'lt': 300000000000000}}, count_only=True)
'''

begin_ts = 0
end_ts = 12349873894898
query_dict = {
    'timestamp': {'$gt': begin_ts, '$lt': end_ts},
}

xapian_search_sentiment = XapianSearch(path='/opt/xapian_weibo/data/20130807', name='master_timeline_sentiment', schema_version=3)
print xapian_search_sentiment.search(query=query_dict, count_only=True)
import leveldb
import os
import random
import sys

LEVELDBPATH = '/home/mirage/leveldb'
global_user_field_bucket = leveldb.LevelDB(os.path.join(LEVELDBPATH, 'linhao_global_user_field'),
                                           block_cache_size=8 * (2 << 25),
                                           write_buffer_size=8 * (2 << 25))
user_daily_field_bucket = leveldb.LevelDB(os.path.join(LEVELDBPATH, 'linhao_user_daily_field'),
                                          block_cache_size=8 * (2 << 25),
                                          write_buffer_size=8 * (2 << 25))

from xapian_weibo.xapian_backend import XapianSearch

user_search = XapianSearch(path='/opt/xapian_weibo/data/', name='master_timeline_user', schema_version=1)

query_dict = {
    '_id': {
        '$gt': 0,
        '$lt': 100000000000000000000
    }
}

count, get_results = user_search.search(query=query_dict)
print 'user count: ', count

fields_value = ['culture', 'education', 'entertainment', 'fashion', 'finance', 'media', 'sports', 'technology']

count = 0
for r in get_results():
    uid = r['_id']
# -*- coding: utf-8 -*-
import json
import time

from xapian_weibo.xapian_backend import XapianSearch

XAPIAN_USER_DATA_PATH = '/home/xapian/xapian_user/'
xs = XapianSearch(path=XAPIAN_USER_DATA_PATH, name='master_timeline_user', schema_version=1)

with open('total_users.json', 'w') as f:
    record = 1
    ts = time.time()
    tm = ts
    for count, item in enumerate(xs.iter_all_docs()):  # enumerate yields both the index and the document
        f.write(json.dumps(item) + '\n')
        if (count + 1) % 100000 == 0:
            te = time.time()
            span = round(te - tm)
            print '%s chunk spend: %s' % (record, span)
            record += 1
            tm = time.time()

print 'total docs count: ', (count + 1)
te = time.time()
total_span = round(te - ts)
print 'total spend: %s' % total_span
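# A minimal read-back sketch for the dump written above, not part of the original
# script: total_users.json holds one JSON-encoded user document per line.
def iter_dumped_users(path='total_users.json'):
    with open(path) as f:
        for line in f:
            yield json.loads(line)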
# -*- coding: utf-8 -*-
import pymongo, time, codecs, datetime

try:
    from xapian_weibo.xapian_backend import XapianSearch
    statuses_search = XapianSearch(path='/opt/xapian_weibo/data/', name='master_timeline_weibo', schema_version=2)
except:
    pass


def con_database():
    DB_HOST = '219.224.135.60'
    DB_PORT = 27017
    DB_USER = '******'
    DB_PWD = 'root'
    connection = pymongo.Connection(DB_HOST, DB_PORT)
    db = connection.admin
    db.authenticate(DB_USER, DB_PWD)
    return connection.test_crawler_liwenwen


def main(uid, startdate, enddate):
    startts = date2ts(startdate)
    endts = date2ts(enddate)
    db = con_database()
    print db.users.find({'uid': str(uid), 'ts': {'$gte': startts, '$lte': endts}}).count()
    cursor = db.users.find({'uid': str(uid), 'ts': {'$gte': startts, '$lte': endts}})
    for weibo in cursor:
        print weibo


def date2ts(date):
    return int(time.mktime(time.strptime(date, '%Y-%m-%d')))
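# A minimal usage sketch, not part of the original script; the uid and date range
# below are hypothetical.
if __name__ == '__main__':
    main(1934744637, '2013-01-01', '2013-01-31')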
def calculate_topic(kw):
    # initialization
    topic_info = {}
    topic_index = {}
    date_list = []
    perday_count_list = []
    topic_rel_blog = []
    topic_url = []
    topic_participents = []
    topic_leader = []
    topic_date = []
    blogs_sum = 0
    comments_sum = 0
    topic_ori_blog = []
    city_count = {}

    html = '''<select name="province" id="province" defvalue="11"><option value="34">安徽</option><option value="11">北京</option><option value="50">重庆</option><option value="35">福建</option><option value="62">甘肃</option>
<option value="44">广东</option><option value="45">广西</option><option value="52">贵州</option><option value="46">海南</option><option value="13">河北</option>
<option value="23">黑龙江</option><option value="41">河南</option><option value="42">湖北</option><option value="43">湖南</option><option value="15">内蒙古</option><option value="32">江苏</option>
<option value="36">江西</option><option value="22">吉林</option><option value="21">辽宁</option><option value="64">宁夏</option><option value="63">青海</option><option value="14">山西</option><option value="37">山东</option>
<option value="31">上海</option><option value="51">四川</option><option value="12">天津</option><option value="54">西藏</option><option value="65">新疆</option><option value="53">云南</option><option value="33">浙江</option>
<option value="61">陕西</option><option value="71">台湾</option><option value="81">香港</option><option value="82">澳门</option><option value="400">海外</option><option value="100">其他</option></select>'''
    province_soup = BeautifulSoup(html)
    for province in province_soup.findAll('option'):
        pp = province.string
        if pp == u'海外' or pp == u'其他':  # skip "overseas" and "other"
            continue
        city_count[pp] = 0

    gt = calendar.timegm(datetime(2012, 1, 1).timetuple())
    lt = calendar.timegm(datetime(2012, 1, 10).timetuple())
    s = XapianSearch(path='/opt/xapian_weibo/data/', name='master_timeline_weibo', schema_version=2)
    count, get_results = s.search(query={'text': [u'%s' % kw], 'timestamp': {'$gt': gt, '$lt': lt}},
                                  sort_by=['timestamp'],
                                  fields=['text', 'timestamp', 'reposts_count', 'comments_count', 'user', 'terms',
                                          '_id', 'retweeted_status', 'bmiddle_pic', 'geo', 'source', 'attitudes_count'])

    for r in get_results():
        # date and number of weibos per day
        temp_date = date.fromtimestamp(r['timestamp'])
        if len(date_list) == 0:
            date_list.append(temp_date)
            perday_count_list.append(1)
        else:
            if temp_date < date_list[-1]:
                if temp_date in date_list:
                    temp_index = date_list.index(temp_date)
                    perday_count_list[temp_index] += 1
                else:
                    i = 0
                    while i < len(date_list):
                        if temp_date < date_list[0]:
                            date_list.insert(0, temp_date)
                            perday_count_list.insert(0, 1)
                            break
                        else:
                            if temp_date > date_list[i] and temp_date < date_list[i + 1]:
                                date_list.insert(i + 1, temp_date)
                                perday_count_list.insert(i + 1, 1)
                                break
                            else:
                                i += 1
            if temp_date == date_list[-1]:
                perday_count_list[-1] += 1
            if temp_date > date_list[-1]:
                timedelta = date(2000, 1, 2) - date(2000, 1, 1)
                while date_list[-1] != temp_date:
                    temp_date1 = date_list[-1] + timedelta
                    date_list.append(temp_date1)
                    perday_count_list.append(0)
                perday_count_list[-1] = 1

        if r['user']:
            uid = int(r['user'])
            user = get_user(uid)
            if user != None:
                if user not in topic_participents:
                    topic_participents.append(user)
                if r['retweeted_status'] == None:
                    temp_ori = {}
                    temp_ori['status'] = r
                    temp_ori['user'] = user
                    topic_ori_blog.append(temp_ori)
                if r['reposts_count'] != None and r['comments_count'] != None:
                    rc = r['reposts_count'] + r['comments_count']
                    if rc > 1500:
                        topic_leader.append(user)
                    if r['reposts_count'] > 1000:
                        temp = {}
                        temp['status'] = r
                        temp['status']['created_at'] = datetime.fromtimestamp(r['timestamp'])
                        temp['status']['text'] = r['text'].decode("utf-8")
                        temp['status']['source'] = re.match('<.*?>(.*)<.*?>', r['source']).group(1).decode("utf-8")
                        temp['user'] = user
                        topic_rel_blog.append(temp)
                if r['bmiddle_pic']:
                    topic_url.append(r['bmiddle_pic'])
                if r['geo'] != None and r['geo'].has_key('province_name'):
                    p = r['geo']['province_name'].split('省')[0]
                    if p == u'海外' or p == u'其他':
                        pass
                    else:
                        city_count[p] += 1
                elif user['location']:
                    p = user['location'].split(' ')[0]
                    if p == u'海外' or p == u'其他':
                        pass
                    else:
                        city_count[p] += 1
                else:
                    pass
            else:
                pass
        comments_sum = comments_sum + r['comments_count']
        blogs_sum += 1

    timedelta = len(date_list)
    avg = blogs_sum / float(timedelta)
    i = 0
    persistent_index = 0
    temp_sudden = 0
    while i < int(timedelta):
        if perday_count_list[i] > avg:
            persistent_index += 1
            temp_sudden = perday_count_list[i] - avg + temp_sudden
            i += 1
        else:
            i += 1
    sudden_index = '%10.2f' % (temp_sudden / float(blogs_sum))
    coverage_index = '%10.2f' % ((blogs_sum + comments_sum) / (24 * float(timedelta)))

    media_index = 0
    top_medias = []
    medias = db.session.query(Media)
    for media in medias:
        media_name = media.mediaName
        top_medias.append(media_name)
    media_list = []
    for r in topic_ori_blog:
        tmedia = []
        tmedia.append(r['user']['name'])
        x = r['status']['comments_count'] + r['status']['reposts_count']
        tmedia.append(x)
        media_list.append(tmedia)
    media_list.sort(key=lambda tmedia: tmedia[1], reverse=True)
    if len(media_list) >= 20:
        m = 0
        while m < 20:
            if media_list[m][0] in top_medias:
                media_index += 1
                m += 1
            else:
                m += 1
    else:
        m = 0
        while m < len(media_list):
            if media_list[m][0] in top_medias:
                media_index += 1
                m += 1
            else:
                m += 1

    leader_index = len(topic_leader)

    work_list = []
    work_count = []
    fields = db.session.query(Field)
    for field in fields:
        field_name = field.fieldName
        work_list.append(field_name)
        work_count.append(0)
    for r in topic_participents:
        k = 0
        while k < len(work_list):
            if r['userField'] == work_list[k]:
                work_count[k] += 1
                break
            else:
                k += 1

    topic_index['persistent_index'] = persistent_index
    topic_index['sudden_index'] = sudden_index
    topic_index['coverage_index'] = coverage_index
    topic_index['media_index'] = media_index
    topic_index['leader_index'] = leader_index

    map_data = province_color_map(city_count)

    topic_info['topic_poster'] = topic_participents[0]['name']
    topic_info['topic_post_date'] = date_list[0]
    topic_info['topic_leader_count'] = len(topic_leader)
    topic_info['topic_participents'] = len(topic_participents)
    topic_info['blogs_sum'] = blogs_sum
    topic_info['topic_ori_blog_count'] = len(topic_ori_blog)
    topic_info['topic_url'] = topic_url
    topic_info['perday_count_list'] = perday_count_list
    topic_info['date_list'] = date_list
    topic_info['topic_rel_blog'] = topic_rel_blog
    topic_info['geo'] = map_data
    topic_info['topic_leader'] = topic_leader
    topic_info['topic_working_list'] = work_list
    topic_info['topic_working_count'] = work_count
    topic_info['topic_index'] = topic_index
    topic_info['gt'] = gt
    topic_info['lt'] = lt
    return topic_info
# -*- coding: utf-8 -*-
from xapian_weibo.xapian_backend_extra import Schema
from xapian_weibo.xapian_backend import XapianSearch
import leveldb
import datetime
import time
import os

LEVELDBPATH = '/home/mirage/leveldb'
weibo_daily_sentiment_count_global_bucket = leveldb.LevelDB(os.path.join(LEVELDBPATH, 'lijun_weibo_daily_sentiment_count_global'),
                                                            block_cache_size=8 * (2 << 25),
                                                            write_buffer_size=8 * (2 << 25))

total_days = 90
emotions_kv = {'happy': 1, 'angry': 2, 'sad': 3}

s = XapianSearch(path='/opt/xapian_weibo/data/', name='master_timeline_sentiment', schema=Schema, schema_version=1)

today = datetime.datetime.today()
now_ts = time.mktime(datetime.datetime(today.year, today.month, today.day, 2, 0).timetuple())
now_ts = int(now_ts)
during = 24 * 3600

for i in xrange(-total_days + 1, 1):
    begin_ts = now_ts + during * (i - 1)
    end_ts = now_ts + during * i
    print i, begin_ts, end_ts
    for emotion in emotions_kv.keys():
        query_dict = {
            'timestamp': {'$gt': begin_ts, '$lt': end_ts},
            'sentiment': emotions_kv[emotion],
        }
import os
from xapian_weibo.xapian_backend import XapianSearch
from operator import itemgetter
import datetime
import time
import leveldb

LEVELDBPATH = '/home/mirage/leveldb'
global_user_field_bucket = leveldb.LevelDB(os.path.join(LEVELDBPATH, 'linhao_global_user_field_20131012'),
                                           block_cache_size=8 * (2 << 25),
                                           write_buffer_size=8 * (2 << 25))

xapian_search_weibo = XapianSearch(path='/opt/xapian_weibo/data/', name='master_timeline_weibo')                     # search by index
xapian_search_user = XapianSearch(path='/opt/xapian_weibo/data/', name='master_timeline_user', schema_version=1)    # search by index

fields_value = ['culture', 'education', 'entertainment', 'fashion', 'finance', 'media', 'sports', 'technology']


def readProtoUser():
    protou = {}
    with open("/home/mirage/linhao/project_bishe/weibo/profile/user_classify/protou.txt") as f:
        for line in f:
            area = line.split(":")[0]
            if area not in protou:
                protou[area] = set()
            for u in line.split(":")[1].split():
                protou[area].add(int(u))
    return protou
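# A minimal usage sketch, not part of the original script: print how many proto (seed)
# users were read for each field.
if __name__ == '__main__':
    protou = readProtoUser()
    for area in fields_value:
        print area, len(protou.get(area, set()))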