def _load_weibos_from_xapian():
    """Search the 'master_timeline_weibo' index for posts in a fixed window.

    The window runs from 2012-09-01 to 2013-01-01. Both bounds are converted
    with ``time.mktime``, which interprets them in the machine's local time
    zone. The hit count returned by the search is printed as a side effect.

    Returns:
        The second element of the pair returned by ``XapianSearch.search``.
        NOTE(review): presumably a callable that yields the matching records
        (restricted to ``_id``, ``user``, ``text`` and ``timestamp``) when
        invoked -- confirm against the backend.
    """
    window_start = datetime.datetime(2012, 9, 1)
    window_end = datetime.datetime(2013, 1, 1)

    query_dict = {
        'timestamp': {
            '$gt': time.mktime(window_start.timetuple()),
            '$lt': time.mktime(window_end.timetuple()),
        },
    }

    searcher = XapianSearch(path='/opt/xapian_weibo/data/20130616/', name='master_timeline_weibo')
    count, get_results = searcher.search(query=query_dict, fields=['_id', 'user', 'text', 'timestamp'])
    # Parenthesised single-argument form: prints the same on Python 2 and is
    # also valid syntax on Python 3 (the bare print statement is not).
    print(count)
    return get_results
Exemplo n.º 2
0
def _load_weibos_from_xapian():
    """Query the 'master_timeline_weibo' Xapian index for a fixed date range.

    The range is 2012-09-01 .. 2013-01-01, expressed as ``time.mktime``
    timestamps (i.e. in the local time zone of the machine running this).
    The hit count is printed before returning.

    Returns:
        The second value returned by ``XapianSearch.search``.
        NOTE(review): presumably a callable producing the matching records
        with the ``_id``, ``user``, ``text`` and ``timestamp`` fields --
        confirm against the backend.
    """
    begin_ts = time.mktime(datetime.datetime(2012, 9, 1).timetuple())
    end_ts = time.mktime(datetime.datetime(2013, 1, 1).timetuple())

    timestamp_range = {'$gt': begin_ts, '$lt': end_ts}
    query_dict = {'timestamp': timestamp_range}
    wanted_fields = ['_id', 'user', 'text', 'timestamp']

    s = XapianSearch(path='/opt/xapian_weibo/data/20130616/',
                     name='master_timeline_weibo')
    count, get_results = s.search(query=query_dict, fields=wanted_fields)
    # print(...) with one argument behaves identically on Python 2 and,
    # unlike the print statement, also parses on Python 3.
    print(count)
    return get_results
Exemplo n.º 3
0
# NOTE(review): the bare `"""` immediately after this block suggests these
# lines are the tail of a triple-quoted region opened above the visible
# fragment (i.e. disabled code) -- confirm before relying on them running.
# Run the current query_dict asking only for the 'user' field and print the
# hit count.
count, get_results = s.search(query=query_dict, fields=['user'])
print count
# Collect the distinct 'user' values across the returned records.
uids = set()
for r in get_results():
    uids.add(r['user'])

# Number of distinct users among the hits.
print len(uids)
"""

# query5: top keywords over a one-hour window starting at 2013-01-01 00:00.
# time.mktime interprets the date in the machine's local time zone.
# print(...) with a single argument is used throughout: same output on
# Python 2, and valid syntax on Python 3 as well.
print('query5:')
begin_ts1 = time.mktime(datetime.datetime(2013, 1, 1).timetuple())

# 3600 seconds == one hour after begin_ts1.
query_dict = {
    'timestamp': {'$gt': begin_ts1, '$lt': begin_ts1 + 3600},
}
# Only the 'terms' field is requested; it is what top_keywords is fed below.
count, get_results = s.search(query=query_dict, fields=['terms'])
print(count)
print(top_keywords(get_results, top=10))

# The usages below are not maintained for now because the interface changed,
# but they are kept because they are still useful as a reference.
"""
# NOTE(review): this block follows an opening `"""` and a comment saying the
# usages below are unmaintained -- presumably reference-only text that is not
# executed; confirm.
# query2: a nested boolean query combining '$and', '$or' and '$not' clauses
# with a top-level 'name' condition.
print 'query2:'
# NOTE(review): the range keys here are 'gt'/'lt' without the '$' prefix that
# '$and'/'$or'/'$not' carry -- confirm which spelling the query parser expects.
# The zero-padded literal 0000000000 evaluates to 0 on Python 2 and is a
# syntax error on Python 3.
query_dict = {'$and': [{'text': [u'中国'], 'uid': 1217743083},
                       {'uid': 1217743083},
                       {'$or': [{'ts': {'gt': 0,
                                        'lt': 1334450340}},
                                {'uid': 0000000000}]}],
              '$not': {'text': u'宝马', 'name': u'白之兔'},
              'name': u'袁岳'
              }
Exemplo n.º 4
0
# -*- coding:utf-8 -*-

import sys
import  calendar
import datetime

sys.path.append('../xapian_weibo')
from xapian_backend import XapianSearch

# Demo of two XapianSearch queries against the 'statuses' index under ../data/.
# All print calls use the parenthesised single-argument form, which produces
# the same output on Python 2 and is also valid syntax on Python 3.
s = XapianSearch(path='../data/', name='statuses')

# query1: flat query -- a text term, a uid, and a 'ts' range.
# NOTE(review): sort_by=['-ts'] presumably means "ts, descending" -- confirm
# against the backend.
results = s.search(
    query={'text': [u'中国'], 'uid': 1217743083, 'ts': {'$gt': 0, '$lt': 1334450340}},
    sort_by=['-ts'],
    fields=['text', 'ts', 'name']
)

print('query1:')

# Here the search result is used as a dict with 'results' (the records) and
# 'hits' (the count).
for r in results['results']:
    print(r['ts'])

print('hits: %s' % results['hits'])

# query2: nested boolean query built from '$and' / '$or' / '$not' clauses.
print('query2:')
# NOTE(review): the range keys below are 'gt'/'lt', while query1 above uses
# '$gt'/'$lt' -- left unchanged here; confirm which spelling the parser expects.
query_dict = {
    '$and': [
        {'text': [u'中国'], 'uid': 1217743083},
        {'uid': 1217743083},
        {'$or': [
            {'ts': {'gt': 0, 'lt': 1334450340}},
            # Was written as the zero-padded literal 0000000000, which is 0 on
            # Python 2 and a syntax error on Python 3.
            {'uid': 0},
        ]},
    ],
    '$not': {'text': u'宝马', 'name': u'白之兔'},
    'name': u'袁岳',
}

results = s.search(query=query_dict, sort_by=['-ts'], fields=['text', 'ts'])
# Handle used by the loop below to look up each uid via
# mongo.master_timeline_user.find_one(...).
# NOTE(review): _default_mongo is defined outside this fragment; presumably it
# returns a database handle for 'master_timeline' on the given host -- confirm.
mongo = _default_mongo(host='219.224.135.60', usedb='master_timeline')

# Output files, opened in 'w' mode (truncated if they already exist). The loop
# below writes one uid per line to them.
# NOTE(review): no close() is visible in this fragment -- confirm the handles
# are closed after the loop.
# NOTE(review): as the loop below is written, a uid with count == 0 whose
# statuses_count is within the tolerance is written to BOTH files -- confirm
# that this is intended.
existed_file = open('2011_emotion_users_existed_20130615.txt', 'w')
missing_file = open('2011_emotion_users_missing_20130615.txt', 'w')
with open('/home/arthas/dev/scrapy_weibo/test/2011_emotion_users.txt') as f:
    missing = 0
    not_exist = 0
    per_page_missing = 30
    iter_count = 0
    for line in f:
        iter_count += 1
        if iter_count % 100 == 0:
            print iter_count, missing, not_exist
        uid = line.split()[0]
        uid = int(uid)
        count = s.search(query={'user': uid}, count_only=True)
        r = mongo.master_timeline_user.find_one({'_id': uid})
        if r:
            page = r['statuses_count'] / 100
            if r['statuses_count'] % 100 > 0:
                page += 1

            if r['statuses_count'] - count > page * per_page_missing and count > 0:
                missing += 1
                missing_file.write('%s\n' % uid)
            elif r['statuses_count'] - count <= page * per_page_missing:
                existed_file.write('%s\n' % uid)
            if count == 0:
                not_exist += 1
                missing_file.write('%s\n' % uid)
        else: