def acquire_user_by_id(uid):
    """Fetch basic profile data for one user from the user Xapian index.

    ``uid`` is converted with ``int()`` before the lookup.

    Returns a dict with the keys ``name``, ``location``, ``count1`` (the
    stored ``followers_count``) and ``count2`` (the stored
    ``friends_count``).  When the lookup yields a falsy result an empty
    dict is returned.
    """
    XAPIAN_USER_DATA_PATH = "/home/xapian/xapian_user/"
    searcher = XapianSearch(path=XAPIAN_USER_DATA_PATH,
                            name="master_timeline_user",
                            schema_version=1)
    wanted = ["name", "location", "followers_count", "friends_count"]
    hit = searcher.search_by_id(int(uid), fields=wanted)

    # Nothing found: hand back an empty profile.
    if not hit:
        return {}

    return {
        "name": hit["name"],
        "location": hit["location"],
        "count1": hit["followers_count"],
        "count2": hit["friends_count"],
    }
def acquire_user_by_id(uid):
    """Fetch profile data for one user from the user Xapian index.

    ``uid`` is converted with ``int()`` before the lookup.

    Returns a dict holding ``name``, ``location``, ``followers_count``,
    ``friends_count`` and ``profile_image_url`` copied from the search
    result, or ``None`` when the lookup yields a falsy result.
    """
    wanted = ['name', 'location', 'followers_count', 'friends_count',
              'profile_image_url']
    searcher = XapianSearch(path=XAPIAN_USER_DATA_PATH,
                            name='master_timeline_user',
                            schema_version=1)
    hit = searcher.search_by_id(int(uid), fields=wanted)
    if not hit:
        return None

    # The returned keys are exactly the requested fields.
    profile = {}
    for field in wanted:
        profile[field] = hit[field]
    return profile
def acquire_user_by_id(uid):
    """Look up a user by id in the ``master_timeline_user`` Xapian index.

    ``uid`` is converted with ``int()`` before the lookup.

    Returns a dict with ``name`` and ``location`` plus ``count1`` /
    ``count2``, which carry the result's ``followers_count`` /
    ``friends_count``.  An empty dict is returned when the lookup yields
    a falsy result.
    """
    XAPIAN_USER_DATA_PATH = '/home/xapian/xapian_user/'
    # (key in the returned dict, field read from the search result)
    key_to_field = (
        ('name', 'name'),
        ('location', 'location'),
        ('count1', 'followers_count'),
        ('count2', 'friends_count'),
    )

    index = XapianSearch(path=XAPIAN_USER_DATA_PATH,
                         name='master_timeline_user',
                         schema_version=1)
    found = index.search_by_id(
        int(uid),
        fields=['name', 'location', 'followers_count', 'friends_count'])

    profile = {}
    if found:
        for out_key, field in key_to_field:
            profile[out_key] = found[field]
    return profile
def getXapianWeiboByDate(datestr):
    """Open the weibo Xapian index for a single day.

    ``datestr`` is a date string such as ``20130908``; it is appended to
    the module-level ``path`` to form the stub file name.

    Returns a ``XapianSearch`` built on that stub file, or ``None`` when
    the stub file does not exist.
    """
    stub = path + datestr
    if not os.path.exists(stub):
        return None
    return XapianSearch(stub=stub, schema_version='5')
def getXapianWeiboByTopic(topic_id='545f4c22cf198b18c57b8014'):
    """Open the weibo Xapian index that belongs to one topic.

    The stub file is expected under ``XAPIAN_WEIBO_TOPIC_DATA_PATH`` as
    ``stub/xapian_weibo_topic_stub_<topic_id>``.

    Returns a ``XapianSearch`` built on that stub file, or ``None`` when
    the stub file does not exist.
    """
    stub_name = 'stub/xapian_weibo_topic_stub_' + str(topic_id)
    stub = XAPIAN_WEIBO_TOPIC_DATA_PATH + stub_name
    if not os.path.exists(stub):
        return None
    return XapianSearch(stub=stub, schema_version='5')
def getXapianWeiboByTopic(topic_id='54635178e74050a373a1b939'):
    """Open the weibo Xapian index that belongs to one topic.

    The stub file is expected under ``XAPIAN_WEIBO_TOPIC_DATA_PATH`` as
    ``stub/xapian_weibo_topic_stub_<topic_id>``.  Whether the stub was
    found is printed to stdout.

    Returns a ``XapianSearch`` built on that stub file, or ``None`` when
    the stub file does not exist.
    """
    stub_name = 'stub/xapian_weibo_topic_stub_' + str(topic_id)
    stub = XAPIAN_WEIBO_TOPIC_DATA_PATH + stub_name

    if not os.path.exists(stub):
        print('stub not exist')
        return None

    print('stub exist')
    return XapianSearch(stub=stub, schema_version='5')
def getXapianWeiboByTopic(topic_id='54ccbfab5a220134d9f7fc1b37'):
    """Return a searcher over the Xapian index of the given topic.

    The stub file name is ``XAPIAN_WEIBO_TOPIC_DATA_PATH`` followed by
    ``stub/xapian_weibo_topic_stub_`` and ``str(topic_id)``.  A short
    status message is printed in either case.

    Returns a ``XapianSearch`` built on the stub file, or ``None`` when
    the stub file does not exist.
    """
    stub = (XAPIAN_WEIBO_TOPIC_DATA_PATH
            + 'stub/xapian_weibo_topic_stub_'
            + str(topic_id))

    found = os.path.exists(stub)
    if found:
        print('stub exist')
        searcher = XapianSearch(stub=stub, schema_version='5')
    else:
        print('stub not exist')
        searcher = None
    return searcher
def getXapianWeiboByTopic(topic):
    """Open the topic weibo Xapian index from a fixed stub file.

    ``topic`` is accepted but not used: the stub file location is
    hard-coded.  The stub path and whether it exists are printed to
    stdout.

    Returns a ``XapianSearch`` built on the stub file, or ``None`` when
    the stub file does not exist.
    """
    stub = '/home/ubuntu4/ljh/csv/stub/master_timeline_weibo_topic'
    print(stub)

    if not os.path.exists(stub):
        print('stub not exist')
        return None

    print('stub exist')
    return XapianSearch(stub=stub, schema_version='5')
def getXapianWeiboByDate(datestr):
    """Open the weibo Xapian index for a single day, logging to stdout.

    ``datestr`` is a date string such as ``20130908``; it is appended to
    the module-level ``path`` to form the stub file name, which is
    printed together with a found / not-found message.

    Returns a ``XapianSearch`` built on that stub file, or ``None`` when
    the stub file does not exist.
    """
    stub = path + datestr
    print(stub)

    if not os.path.exists(stub):
        print('stub not exist')
        return None

    print('step--stub exist')
    return XapianSearch(stub=stub, schema_version='5')
def getXapianWeiboByTopic(topic, start_ts, end_ts):
    """Open the weibo Xapian index for a topic within a time range.

    The three arguments are passed to ``topic2xapian`` to obtain a topic
    id; the stub file is then looked up as
    ``/home/xapian/xapian_weibo_topic/stub/xapian_weibo_topic_stub_<id>``.
    Whether the stub was found is printed to stdout.

    Returns a ``XapianSearch`` built on that stub file, or ``None`` when
    the stub file does not exist.
    """
    XAPIAN_WEIBO_TOPIC_DATA_PATH = '/home/xapian/xapian_weibo_topic/'
    resolved_id = topic2xapian(topic, start_ts, end_ts)
    stub_name = 'stub/xapian_weibo_topic_stub_' + str(resolved_id)
    stub = XAPIAN_WEIBO_TOPIC_DATA_PATH + stub_name

    if not os.path.exists(stub):
        print('stub not exist')
        return None

    print('stub exist')
    return XapianSearch(stub=stub, schema_version='5')
def acquire_user_by_id(uid):
    """Return selected profile fields of a user, or ``None`` if not found.

    The user is looked up by ``int(uid)`` in the ``master_timeline_user``
    Xapian index located at ``XAPIAN_USER_DATA_PATH``.  On a hit, the
    returned dict contains ``name``, ``location``, ``followers_count``,
    ``friends_count`` and ``profile_image_url``.
    """
    profile_fields = [
        'name', 'location', 'followers_count', 'friends_count',
        'profile_image_url'
    ]
    index = XapianSearch(path=XAPIAN_USER_DATA_PATH,
                         name='master_timeline_user',
                         schema_version=1)
    record = index.search_by_id(int(uid), fields=profile_fields)

    if record:
        # Copy only the requested fields into a plain dict.
        return dict((key, record[key]) for key in profile_fields)
    return None
def getXapianWeiboByDuration(datestr_list):
    """Open one searcher spanning the weibo indexes of several days.

    Each entry of ``datestr_list`` is appended to the module-level
    ``path`` to form a stub file name; names whose file does not exist
    are dropped.

    Returns a ``XapianSearch`` built on the list of existing stub files
    (with ``include_remote=True``), or ``None`` when none of them exist.
    """
    existing_stubs = [
        stub
        for stub in (path + datestr for datestr in datestr_list)
        if os.path.exists(stub)
    ]
    if not existing_stubs:
        return None
    return XapianSearch(stub=existing_stubs, include_remote=True,
                        schema_version='5')
'university', 'homeadmin', 'abroadadmin', 'homemedia', 'abroadmedia', 'folkorg', \ 'lawyer', 'politician', 'mediaworker', 'activer', 'grassroot', 'other'] DOMAIN_ZH_LIST = [u'文化', u'教育', u'娱乐', u'时尚', u'财经', u'媒体', u'体育', u'科技', u'境外', \ u'高校微博', u'境内机构', u'境外机构', u'境内媒体', u'境外媒体', u'民间组织', u'律师', \ u'政府官员', u'媒体人士', u'活跃人士', u'草根', u'其它'] MYSQL_HOST = '219.224.135.47' MYSQL_USER = '******' MYSQL_DB = 'weibocase' MONGODB_HOST = '219.224.135.47' MONGODB_PORT = 27019 SSDB_PORT = 8888 SSDB_HOST = '219.224.135.47' # SSDB服务器在47 XAPIAN_USER_DATA_PATH = '/home/xapian/xapian_user/' XAPIAN_WEIBO_TOPIC_DATA_PATH = '/home/xapian/xapian_weibo_topic/' xapian_search_user = XapianSearch(path=XAPIAN_USER_DATA_PATH, name='master_timeline_user', schema_version=1) API_HOST = '219.224.135.47' API_PORT = 9115 MASTER_TIMELINE_54API_MONGOD_HOST = '219.224.135.47' MASTER_TIMELINE_54API_MONGOD_PORT = 27019 MASTER_TIMELINE_54API_WEIBO_DB = '54api_weibo_v2' MASTER_TIMELINE_54API_USER_COLLECTION = 'master_timeline_user' MASTER_TIMELINE_54API_WEIBO_DAILY_COLLECTION_PREFIX = 'master_timeline_weibo_weekly_' MASTER_TIMELINE_54API_WEIBO_TOPIC_COLLECTION_PREFIX = 'master_timeline_weibo_topic_' MASTER_TIMELINE_54API_TOPIC_COLLECTION = 'master_timeline_topic' MASTER_TIMELINE_54API_WEIBO_REPOST_COLLECTION = 'master_timeline_weibo_repost'
MINUTE = 60 FIFTEENMINUTES = 15 * MINUTE HOUR = 3600 SIXHOURS = 6 * HOUR DAY = 24 * HOUR INTERVAL = TENSECONDS REDIS_HOST = '219.224.135.48' REDIS_PORT = 6379 USER_DOMAIN = 'user_domain' # user domain hash BEGIN_TS = time.mktime(datetime.datetime(2013, 9, 1, 0, 0, 0).timetuple()) END_TS = time.mktime(datetime.datetime(2013, 9, 1, 0, 1, 0).timetuple()) s = XapianSearch(stub = PATH, schema_version = '5') def _default_redis(host = REDIS_HOST, port = REDIS_PORT, db = 0): return redis.StrictRedis(host, port, db) r = _default_redis() def cron_index_topic(topic, begin_ts = BEGIN_TS, end_ts = END_TS): if topic and topic != '': topic = topic.strip() query_dict = { 'timestamp':{'$gt':begin_ts,'$lt':end_ts}, 'topics':topic } count,results = s.search(query=query_dict, sort_by=[SORT_FIELD], fields=RESP_ITER_KEYS)
# -*- coding:utf-8 -*- import sys import time import datetime sys.path.append('../xapian_case') from xapian_case.xapian_backend import XapianSearch from xapian_case.utils import top_keywords, not_low_freq_keywords, gen_mset_iter # 默认schema_version为2 s = XapianSearch(path='/home/ubuntu3/huxiaoqian/data/20140724/20140724/', name='master_timeline_weibo', schema_version='5') # import和初始化, 请使用下面的用法 # from xapian_weibo.xapian_backend import XapianSearch # s = XapianSearch(path='/opt/xapian_weibo/data/', name='master_timeline_weibo') # 查询条件有user(id),text,topic,timestamp,reposts_count,comments_count,attitudes_count(从timestamp开始后面四个查询指标可以指定范围和排序) # 返回字段基本和新浪api的返回字段相同,注意没有created_at,而是timestamp # 值得注意的是新增返回字段terms,返回的是每条微博里的词和以及词频的dict(字典),所有不用自己取出来之后再分词 # 若fields参数不指定,或者为None,则返回所有字段,除terms之外 # 如果需要返回terms,请一一指定需要的字段,并包括terms # 简单示例如下 ''' count, get_results = s.search(query={'text': [u'男士']}, sort_by=['-timestamp'], fields=['text', 'timestamp', 'user', 'terms', '_id']) print 'query1:' #根据text查询 if count!=0: for r in get_results():
def save_weibos(excel_name, topic, child_topic_list, w_limit):
    """Store the top weibos of every child topic as OpinionTestWeibos rows.

    Steps:

    1. For each entry of ``child_topic_list`` the workbook sheet named
       ``str(int(entry))`` is read and at most ``w_limit`` leading rows are
       kept as ``(text, weight)`` pairs (weight from column 0, text from
       column 1).
    2. Weibos returned by a fixed Xapian query (test data only, per the
       original author's note) get their ``text`` replaced by those texts,
       five per child topic for topic indexes 0-10, and are extended with
       ``username``, ``profile_image_url``, ``weibo_link`` and a
       ``timestamp`` passed through ``ts2date``.
    3. One ``OpinionTestWeibos`` row per child-topic index is saved,
       replacing an existing row with the same topic and child topic.

    Args:
        excel_name: Workbook file name handed to ``xlrd.open_workbook``.
        topic: Topic value stored on every saved row.
        child_topic_list: Child topic identifiers.  They are used as keys
            here and read back via ``str(index)`` below, so they are
            presumably the strings ``'0'``, ``'1'``, ... -- TODO confirm
            against the caller.
        w_limit: Maximum number of rows taken from each sheet.

    Raises:
        KeyError / IndexError: if a child topic has no sheet data under
            ``str(index)``, fewer than five rows, or the Xapian query
            returns too few weibos to fill every child topic.
    """
    # TODO (original author): look up the related weibo ids etc. from the
    # text content.
    data = xlrd.open_workbook(excel_name)
    weibos_dict = {}
    for i in child_topic_list:
        table_weibos = data.sheet_by_name(str(int(i)))
        # Rows are taken from the top because the data is already sorted by
        # weight in descending order (per the original author's note).
        # The original read an undefined name (``n_row_weibo``) here, which
        # raised NameError whenever the sheet had <= w_limit rows.
        n_rows = min(table_weibos.nrows, w_limit)
        pairs = []
        for j in range(n_rows):
            line = table_weibos.row_values(j)
            weibo_text = line[1]
            weibo_weight = line[0]
            # TODO (original author): this should append the full weibo
            # (including username etc.) looked up from the text.
            pairs.append((weibo_text, weibo_weight))
        weibos_dict[i] = pairs

    # Fetch concrete weibo records; used for testing only.
    s = XapianSearch(path='/home/ubuntu3/huxiaoqian/case/20140724/20140804/',
                     name='master_timeline_weibo', schema_version='5')
    begin_ts = 1378050300
    end_ts = 1378051200
    query_dict = {
        'timestamp': {'$gt': begin_ts, '$lt': end_ts},
        'message_type': 2
    }
    weibos_dict_new = {}
    scount, weibo_results = s.search(query=query_dict, fields=fields_list)

    # i walks the child-topic indexes 0..10, j the five weibos per topic.
    i = 0
    j = 0
    for weibo in weibo_results():
        if i == 11:
            break
        weibo['text'] = weibos_dict[str(i)][j][0]
        # Attach username, profile image and weibo url.
        username, profileimage = getuserinfo(weibo['user'])
        weibo['username'] = username
        weibo['profile_image_url'] = profileimage
        weibo['timestamp'] = ts2date(weibo['timestamp'])
        weibo['weibo_link'] = weiboinfo2url(weibo['user'], weibo['_id'])
        weight = weibos_dict[str(i)][j][1]
        # setdefault replaces the former bare ``except:`` fallback that
        # created the list on first use.
        weibos_dict_new.setdefault(i, []).append((weibo, weight))
        if j == 4:
            j = 0
            i += 1
        else:
            j += 1

    for i in range(len(child_topic_list)):
        item = OpinionTestWeibos(topic, i, json.dumps(weibos_dict_new[i]))
        item_exist = db.session.query(OpinionTestWeibos).filter(
            OpinionTestWeibos.topic == topic,
            OpinionTestWeibos.child_topic == i).first()
        # Replace an already stored row for this (topic, child topic).
        if item_exist:
            db.session.delete(item_exist)
        db.session.add(item)
    # NOTE(review): the whitespace-mangled original does not show whether
    # the commit ran once or per item; a single commit is used -- confirm.
    db.session.commit()
# -*- coding:utf-8 -*- import sys import time import datetime sys.path.append('../xapian_case') from xapian_case.xapian_backend import XapianSearch from xapian_case.utils import top_keywords, not_low_freq_keywords, gen_mset_iter # 默认schema_version为2 s = XapianSearch(path='/home/ubuntu3/huxiaoqian/data/20140724/20140724/', name='master_timeline_weibo',schema_version='5') # import和初始化, 请使用下面的用法 # from xapian_weibo.xapian_backend import XapianSearch # s = XapianSearch(path='/opt/xapian_weibo/data/', name='master_timeline_weibo') # 查询条件有user(id),text,topic,timestamp,reposts_count,comments_count,attitudes_count(从timestamp开始后面四个查询指标可以指定范围和排序) # 返回字段基本和新浪api的返回字段相同,注意没有created_at,而是timestamp # 值得注意的是新增返回字段terms,返回的是每条微博里的词和以及词频的dict(字典),所有不用自己取出来之后再分词 # 若fields参数不指定,或者为None,则返回所有字段,除terms之外 # 如果需要返回terms,请一一指定需要的字段,并包括terms # 简单示例如下 ''' count, get_results = s.search(query={'text': [u'男士']}, sort_by=['-timestamp'], fields=['text', 'timestamp', 'user', 'terms', '_id']) print 'query1:' #根据text查询 if count!=0: for r in get_results(): print "** " * 10
def save_weibos(excel_name, topic, child_topic_list, w_limit):
    """Store the top weibos of every child topic as OpinionTestWeibos rows.

    Steps:

    1. For each entry of ``child_topic_list`` the workbook sheet named
       ``str(int(entry))`` is read and at most ``w_limit`` leading rows are
       kept as ``(text, weight)`` pairs (weight from column 0, text from
       column 1).
    2. Weibos returned by a fixed Xapian query (test data only, per the
       original author's note) get their ``text`` replaced by those texts,
       five per child topic for topic indexes 0-10, and are extended with
       ``username``, ``profile_image_url``, ``weibo_link`` and a
       ``timestamp`` passed through ``ts2date``.
    3. One ``OpinionTestWeibos`` row per child-topic index is saved,
       replacing an existing row with the same topic and child topic.

    Args:
        excel_name: Workbook file name handed to ``xlrd.open_workbook``.
        topic: Topic value stored on every saved row.
        child_topic_list: Child topic identifiers.  They are used as keys
            here and read back via ``str(index)`` below, so they are
            presumably the strings ``'0'``, ``'1'``, ... -- TODO confirm
            against the caller.
        w_limit: Maximum number of rows taken from each sheet.

    Raises:
        KeyError / IndexError: if a child topic has no sheet data under
            ``str(index)``, fewer than five rows, or the Xapian query
            returns too few weibos to fill every child topic.
    """
    # TODO (original author): look up the related weibo ids etc. from the
    # text content.
    data = xlrd.open_workbook(excel_name)
    weibos_dict = {}
    for i in child_topic_list:
        table_weibos = data.sheet_by_name(str(int(i)))
        # Rows are taken from the top because the data is already sorted by
        # weight in descending order (per the original author's note).
        # The original read an undefined name (``n_row_weibo``) here, which
        # raised NameError whenever the sheet had <= w_limit rows.
        n_rows = min(table_weibos.nrows, w_limit)
        pairs = []
        for j in range(n_rows):
            line = table_weibos.row_values(j)
            weibo_text = line[1]
            weibo_weight = line[0]
            # TODO (original author): this should append the full weibo
            # (including username etc.) looked up from the text.
            pairs.append((weibo_text, weibo_weight))
        weibos_dict[i] = pairs

    # Fetch concrete weibo records; used for testing only.
    s = XapianSearch(path='/home/ubuntu3/huxiaoqian/case/20140724/20140804/',
                     name='master_timeline_weibo',
                     schema_version='5')
    begin_ts = 1378050300
    end_ts = 1378051200
    query_dict = {
        'timestamp': {
            '$gt': begin_ts,
            '$lt': end_ts
        },
        'message_type': 2
    }
    weibos_dict_new = {}
    scount, weibo_results = s.search(query=query_dict, fields=fields_list)

    # i walks the child-topic indexes 0..10, j the five weibos per topic.
    i = 0
    j = 0
    for weibo in weibo_results():
        if i == 11:
            break
        weibo['text'] = weibos_dict[str(i)][j][0]
        # Attach username, profile image and weibo url.
        username, profileimage = getuserinfo(weibo['user'])
        weibo['username'] = username
        weibo['profile_image_url'] = profileimage
        weibo['timestamp'] = ts2date(weibo['timestamp'])
        weibo['weibo_link'] = weiboinfo2url(weibo['user'], weibo['_id'])
        weight = weibos_dict[str(i)][j][1]
        # setdefault replaces the former bare ``except:`` fallback that
        # created the list on first use.
        weibos_dict_new.setdefault(i, []).append((weibo, weight))
        if j == 4:
            j = 0
            i += 1
        else:
            j += 1

    for i in range(len(child_topic_list)):
        item = OpinionTestWeibos(topic, i, json.dumps(weibos_dict_new[i]))
        item_exist = db.session.query(OpinionTestWeibos).filter(
            OpinionTestWeibos.topic == topic,
            OpinionTestWeibos.child_topic == i).first()
        # Replace an already stored row for this (topic, child topic).
        if item_exist:
            db.session.delete(item_exist)
        db.session.add(item)
    # NOTE(review): the whitespace-mangled original does not show whether
    # the commit ran once or per item; a single commit is used -- confirm.
    db.session.commit()
# -*- coding:utf-8 -*- import sys import time import datetime sys.path.append('../xapian_case') from xapian_case.xapian_backend import XapianSearch from xapian_case.utils import top_keywords, not_low_freq_keywords, gen_mset_iter # 默认schema_version为2 s = XapianSearch(path='/home/ubuntu3/huxiaoqian/case/20140724/20140724/', name='master_timeline_weibo',schema_version='5') #uesr ''' count, get_results = s.search(query={'user': 1811093512}, fields=['text', 'timestamp', 'user', 'terms', '_id']) print 'query1:' if count!=0: for r in get_results(): print "** " * 10 print r['_id'] print r['user'] print r['text'] print r['timestamp'] print r['terms'] print 'hits: %s' % count else: print 'no results' ''' get_results = s.iter_all_docs(fields=['_id', 'user', 'retweeted_uid', 'retweeted_mid', 'text', 'timestamp', 'reposts_count', 'source', 'bmiddle_pic', 'geo', 'attitudes_count', 'comments_count', 'sentiment', 'topics', 'message_type', 'terms' ]) for r in get_results: