def _load_weibos_from_xapian(begin=None, end=None,
                             path='/opt/xapian_weibo/data/20130616/',
                             name='master_timeline_weibo'):
    """Search the Xapian weibo index for posts inside a time window.

    Called with no arguments this behaves exactly as before: it queries the
    'master_timeline_weibo' index under /opt/xapian_weibo/data/20130616/ for
    the window 2012-09-01 .. 2013-01-01.

    Args:
        begin: ``datetime.datetime`` lower bound of the window
            (default 2012-09-01 00:00).
        end: ``datetime.datetime`` upper bound of the window
            (default 2013-01-01 00:00).
        path: directory handed to ``XapianSearch`` as ``path``.
        name: index name handed to ``XapianSearch`` as ``name``.

    Returns:
        The second element of the ``XapianSearch.search`` result.  Other
        callers in this project invoke it (``get_results()``) and iterate
        over the returned records; only the fields '_id', 'user', 'text'
        and 'timestamp' are requested here.

    Side effects:
        Prints the hit count reported by the search to stdout.
    """
    if begin is None:
        begin = datetime.datetime(2012, 9, 1)
    if end is None:
        end = datetime.datetime(2013, 1, 1)

    # time.mktime() interprets the struct_time as *local* time, so the
    # resulting epoch bounds depend on the timezone of the host.
    begin_ts = time.mktime(begin.timetuple())
    end_ts = time.mktime(end.timetuple())

    # NOTE(review): '$gt' / '$lt' are presumably strict bounds -- confirm
    # against the xapian_backend implementation.
    query_dict = {
        'timestamp': {'$gt': begin_ts, '$lt': end_ts},
    }

    s = XapianSearch(path=path, name=name)
    count, get_results = s.search(query=query_dict,
                                  fields=['_id', 'user', 'text', 'timestamp'])
    # Parenthesised single-argument form prints identically under the
    # Python 2 print statement and the Python 3 print function.
    print(count)
    return get_results
def _load_weibos_from_xapian(begin=None, end=None,
                             path='/opt/xapian_weibo/data/20130616/',
                             name='master_timeline_weibo'):
    """Return the result getter for weibos whose timestamp lies in a window.

    With no arguments the original behaviour is kept: the
    'master_timeline_weibo' index stored in
    /opt/xapian_weibo/data/20130616/ is searched for timestamps between
    2012-09-01 and 2013-01-01.

    Args:
        begin: ``datetime.datetime`` start of the window; ``None`` selects
            2012-09-01 00:00.
        end: ``datetime.datetime`` end of the window; ``None`` selects
            2013-01-01 00:00.
        path: value passed as ``path`` to ``XapianSearch``.
        name: value passed as ``name`` to ``XapianSearch``.

    Returns:
        ``get_results`` -- the second item returned by
        ``XapianSearch.search``.  Elsewhere in this project it is called and
        the result iterated to obtain the matching records (restricted here
        to '_id', 'user', 'text' and 'timestamp').

    Side effects:
        Writes the hit count to stdout.
    """
    window_start = datetime.datetime(2012, 9, 1) if begin is None else begin
    window_end = datetime.datetime(2013, 1, 1) if end is None else end

    # mktime() treats the tuple as local time; the epoch values therefore
    # follow the host's timezone setting.
    begin_ts = time.mktime(window_start.timetuple())
    end_ts = time.mktime(window_end.timetuple())

    query_dict = {
        'timestamp': {
            '$gt': begin_ts,
            '$lt': end_ts
        },
    }

    s = XapianSearch(path=path, name=name)
    count, get_results = s.search(query=query_dict,
                                  fields=['_id', 'user', 'text', 'timestamp'])
    # Single-argument print written so it parses on both Python 2 and 3.
    print(count)
    return get_results
print r['terms'] print 'hits: %s' % count stub = '/home/arthas/dev/xapian_weibo/stub/master_timeline_weibo_20130929' s = XapianSearch(stub=stub, include_remote=True) count, get_results = s.search(query={'text': [u'中国']}, sort_by=['-timestamp'], fields=['text', 'timestamp', 'user', 'terms', '_id']) print 'query2:' for r in get_results(): print "** " * 10 print r['_id'] print r['user'] print r['text'] print r['timestamp'] print r['terms'] print 'hits: %s' % count """ print "query3:" stub = '/home/arthas/dev/xapian_weibo/stub/master_timeline_weibo_20130929' s = XapianSearch(stub=stub, include_remote=True) results = s.iter_all_docs() count = 0 for r in results: count += 1 print 'hits: ', count
# -*- coding:utf-8 -*- import sys import time import datetime sys.path.append('../xapian_weibo') from xapian_backend import XapianSearch from utils import top_keywords, not_low_freq_keywords # 默认schema_version为2 s = XapianSearch(path='../data/', name='master_timeline_weibo') # import和初始化, 请使用下面的用法 # from xapian_weibo.xapian_backend import XapianSearch # s = XapianSearch(path='/opt/xapian_weibo/data/', name='master_timeline_weibo') # 查询条件有user(id),retweeted_status(id),text,timestamp,reposts_count,comments_count,attitudes_count(从timestamp开始后面四个查询指标可以指定范围和排序) # 返回字段基本和新浪api的返回字段相同,注意没有created_at,而是timestamp # 值得注意的是新增返回字段terms,返回的是每条微博里的词和以及词频的dict(字典),所有不用自己取出来之后再分词 # 若fields参数不指定,或者为None,则返回所有字段,除terms之外 # 如果需要返回terms,请一一指定需要的字段,并包括terms # 简单示例如下 """ count, get_results = s.search(query={'text': [u'中国'], 'user': 1217743083, 'timestamp': {'$gt': 0, '$lt': 1334450340}}, sort_by=['-timestamp'], fields=['text', 'timestamp', 'user', 'terms', '_id']) print 'query1:' for r in get_results(): print "** " * 10 print r['_id'] print r['user']
# -*- coding:utf-8 -*- import sys sys.path.append('../xapian_weibo') from xapian_backend import XapianSearch s = XapianSearch(path='../data/', name='statuses') query_dict1 = { '$and': [{ 'text': '1', 'uid': '2' }], '$not': { 'name': '3', 'text': '4', }, 'name': '5', } print s.build_query_tree(query_dict1) print s.parse_query(query_dict1) query_dict2 = { '$and': [{ 'text': '1', 'ts': { '$gt': 0, '$lt': 1 }
# -*- coding:utf-8 -*- import sys import time import datetime sys.path.append('../xapian_weibo') from xapian_backend import XapianSearch from utils import top_keywords, not_low_freq_keywords, gen_mset_iter # 默认schema_version为2 s = XapianSearch(path='/opt/xapian_weibo/data/20131207/', name='master_timeline_weibo') # import和初始化, 请使用下面的用法 # from xapian_weibo.xapian_backend import XapianSearch # s = XapianSearch(path='/opt/xapian_weibo/data/', name='master_timeline_weibo') # 查询条件有user(id),retweeted_status(id),text,timestamp,reposts_count,comments_count,attitudes_count(从timestamp开始后面四个查询指标可以指定范围和排序) # 返回字段基本和新浪api的返回字段相同,注意没有created_at,而是timestamp # 值得注意的是新增返回字段terms,返回的是每条微博里的词和以及词频的dict(字典),所有不用自己取出来之后再分词 # 若fields参数不指定,或者为None,则返回所有字段,除terms之外 # 如果需要返回terms,请一一指定需要的字段,并包括terms # 简单示例如下 """ count, get_results = s.search(query={'text': [u'中国'], 'user': 1217743083, 'timestamp': {'$gt': 0, '$lt': 1334450340}}, sort_by=['-timestamp'], fields=['text', 'timestamp', 'user', 'terms', '_id']) print 'query1:' for r in get_results(): print "** " * 10 print r['_id']
# -*- coding:utf-8 -*-
# Manual smoke-test script: runs two sample queries against the local
# 'statuses' Xapian index and prints what comes back.

import sys
import calendar
import datetime

# Make the sibling xapian_weibo directory importable when the script is
# started from its own directory.
sys.path.append('../xapian_weibo')

from xapian_backend import XapianSearch

s = XapianSearch(path='../data/', name='statuses')

# query1: flat query -- text term, exact uid and a timestamp range, newest
# first ('-ts').
results = s.search(
    query={
        'text': [u'中国'],
        'uid': 1217743083,
        'ts': {'$gt': 0, '$lt': 1334450340},
    },
    sort_by=['-ts'],
    fields=['text', 'ts', 'name']
)

print('query1:')
for r in results['results']:
    print(r['ts'])
print('hits: %s' % results['hits'])

# query2: nested boolean query combining '$and', '$or' and '$not'.
print('query2:')
query_dict = {
    '$and': [
        {'text': [u'中国'], 'uid': 1217743083},
        {'uid': 1217743083},
        {'$or': [
            # The range operators were spelled 'gt'/'lt' here, unlike the
            # '$gt'/'$lt' used by query1 above; made consistent.
            # NOTE(review): confirm against xapian_backend that only the
            # '$'-prefixed spelling is recognised.
            {'ts': {'$gt': 0, '$lt': 1334450340}},
            # Was written as the literal 0000000000, which is just 0.
            {'uid': 0},
        ]},
    ],
    '$not': {'text': u'宝马', 'name': u'白之兔'},
    'name': u'袁岳',
}
results = s.search(query=query_dict, sort_by=['-ts'], fields=['text', 'ts'])
# -*- coding:utf-8 -*- import sys sys.path.append('../xapian_weibo') from xapian_backend import XapianSearch s = XapianSearch(path='../data/', name='statuses') query_dict1 = { '$and': [{'text': '1', 'uid': '2'}], '$not': { 'name': '3', 'text': '4', }, 'name': '5', } print s.build_query_tree(query_dict1) print s.parse_query(query_dict1) query_dict2 = { '$and': [{'text': '1', 'ts': {'$gt': 0, '$lt': 1}}, {'$or': [{'uid': 3}, {'uid': 4}]}], '$not': { 'name': '3', 'text': '4', }, 'name': '5',
# -*- coding:utf-8 -*- import sys sys.path.append('../xapian_weibo') from xapian_backend import XapianSearch from utils4scrapy.tk_maintain import _default_mongo # 默认schema_version为2 s = XapianSearch(path='../data/', name='master_timeline_weibo') mongo = _default_mongo(host='219.224.135.60', usedb='master_timeline') existed_file = open('2011_emotion_users_existed_20130615.txt', 'w') missing_file = open('2011_emotion_users_missing_20130615.txt', 'w') with open('/home/arthas/dev/scrapy_weibo/test/2011_emotion_users.txt') as f: missing = 0 not_exist = 0 per_page_missing = 30 iter_count = 0 for line in f: iter_count += 1 if iter_count % 100 == 0: print iter_count, missing, not_exist uid = line.split()[0] uid = int(uid) count = s.search(query={'user': uid}, count_only=True) r = mongo.master_timeline_user.find_one({'_id': uid}) if r: page = r['statuses_count'] / 100 if r['statuses_count'] % 100 > 0: page += 1