def _load_weibos_from_xapian():
    """Search the ``master_timeline_weibo`` index for a fixed time window.

    The window is bounded by 2012-09-01 and 2013-01-01. Both datetimes are
    naive and converted with ``time.mktime``, so they are interpreted in the
    machine's local time zone. The hit count reported by the search is
    printed as a side effect.

    Returns:
        The second element of the pair returned by ``XapianSearch.search``,
        passed through unchanged (presumably a callable that yields the
        matching items -- confirm against ``XapianSearch``).
    """
    window_start = datetime.datetime(2012, 9, 1)
    window_end = datetime.datetime(2013, 1, 1)
    begin_ts = time.mktime(window_start.timetuple())
    end_ts = time.mktime(window_end.timetuple())

    # Range filter on the 'timestamp' field between the two bounds.
    timestamp_range = {'$gt': begin_ts, '$lt': end_ts}
    query_dict = {'timestamp': timestamp_range}

    searcher = XapianSearch(
        path='/opt/xapian_weibo/data/20130616/',
        name='master_timeline_weibo',
    )
    wanted_fields = ['_id', 'user', 'text', 'timestamp']
    count, get_results = searcher.search(query=query_dict, fields=wanted_fields)

    # Parenthesised single-argument form prints the same under Python 2.
    print(count)
    return get_results
def _load_weibos_from_xapian():
    """Run the fixed 2012-09-01 .. 2013-01-01 weibo query against Xapian.

    Opens the ``master_timeline_weibo`` index under
    ``/opt/xapian_weibo/data/20130616/``, searches on the ``timestamp``
    field and requests the ``_id``, ``user``, ``text`` and ``timestamp``
    fields. The reported hit count is printed.

    Returns:
        Whatever ``XapianSearch.search`` returns as the second item of its
        result pair; it is not consumed here.
    """
    def _to_local_epoch(year, month, day):
        # time.mktime treats the struct_time as local time.
        return time.mktime(datetime.datetime(year, month, day).timetuple())

    begin_ts = _to_local_epoch(2012, 9, 1)
    end_ts = _to_local_epoch(2013, 1, 1)

    index = XapianSearch(path='/opt/xapian_weibo/data/20130616/',
                         name='master_timeline_weibo')
    search_outcome = index.search(
        query={'timestamp': {'$gt': begin_ts, '$lt': end_ts}},
        fields=['_id', 'user', 'text', 'timestamp'],
    )
    count, get_results = search_outcome

    print(count)
    return get_results
count, get_results = s.search(query=query_dict, fields=['user']) print count uids = set() for r in get_results(): uids.add(r['user']) print len(uids) """ print 'query5:' begin_ts1 = time.mktime(datetime.datetime(2013, 1, 1).timetuple()) query_dict = { 'timestamp': {'$gt': begin_ts1, '$lt': begin_ts1 + 3600}, } count, get_results = s.search(query=query_dict, fields=['terms']) print count print top_keywords(get_results, top=10) # 下面的用法由于接口的修改暂时没有维护, 但具有参考价值 """ print 'query2:' query_dict = {'$and': [{'text': [u'中国'], 'uid': 1217743083}, {'uid': 1217743083}, {'$or': [{'ts': {'gt': 0, 'lt': 1334450340}}, {'uid': 0000000000}]}], '$not': {'text': u'宝马', 'name': u'白之兔'}, 'name': u'袁岳' }
# -*- coding:utf-8 -*-
# Example queries against the 'statuses' Xapian index stored in ../data/.

import sys
import calendar
import datetime

# xapian_backend lives in a sibling directory; extend the import path
# before importing from it.
sys.path.append('../xapian_weibo')
from xapian_backend import XapianSearch

s = XapianSearch(path='../data/', name='statuses')

# query1: flat query combining a text term, a uid and a 'ts' range.
results = s.search(
    query={
        'text': [u'中国'],
        'uid': 1217743083,
        'ts': {'$gt': 0, '$lt': 1334450340},
    },
    sort_by=['-ts'],
    fields=['text', 'ts', 'name'],
)

print('query1:')
for r in results['results']:
    print(r['ts'])
print('hits: %s' % results['hits'])

# query2: nested query using '$and' / '$or' / '$not' combinators.
# NOTE(review): the range keys below are 'gt'/'lt' while query1 uses
# '$gt'/'$lt' -- confirm which spelling the backend expects.
print('query2:')
query_dict = {
    '$and': [
        {'text': [u'中国'], 'uid': 1217743083},
        {'uid': 1217743083},
        {
            '$or': [
                {'ts': {'gt': 0, 'lt': 1334450340}},
                {'uid': 0000000000},
            ],
        },
    ],
    '$not': {'text': u'宝马', 'name': u'白之兔'},
    'name': u'袁岳',
}
results = s.search(query=query_dict, sort_by=['-ts'], fields=['text', 'ts'])
mongo = _default_mongo(host='219.224.135.60', usedb='master_timeline') existed_file = open('2011_emotion_users_existed_20130615.txt', 'w') missing_file = open('2011_emotion_users_missing_20130615.txt', 'w') with open('/home/arthas/dev/scrapy_weibo/test/2011_emotion_users.txt') as f: missing = 0 not_exist = 0 per_page_missing = 30 iter_count = 0 for line in f: iter_count += 1 if iter_count % 100 == 0: print iter_count, missing, not_exist uid = line.split()[0] uid = int(uid) count = s.search(query={'user': uid}, count_only=True) r = mongo.master_timeline_user.find_one({'_id': uid}) if r: page = r['statuses_count'] / 100 if r['statuses_count'] % 100 > 0: page += 1 if r['statuses_count'] - count > page * per_page_missing and count > 0: missing += 1 missing_file.write('%s\n' % uid) elif r['statuses_count'] - count <= page * per_page_missing: existed_file.write('%s\n' % uid) if count == 0: not_exist += 1 missing_file.write('%s\n' % uid) else: