Пример #1
0
def cal_topic_sentiment_by_date(topic, datestr, duration):
    """Run the daily topic sentiment cron for *topic* over the day *datestr*."""
    day_start = datetime2ts(datestr)
    day_end = day_start + Day
    # xapian indexes are keyed by compact date strings (YYYYMMDD)
    searcher = getXapianWeiboByDate(datestr.replace('-', ''))
    if not searcher:
        return
    sentimentCronTopic(topic, searcher, start_ts=day_start, over_ts=day_end, during=duration)
Пример #2
0
def read_xapian(date):
    # init leveldb
    print 'init leveldb'
    dailycount_bucket = leveldb.LevelDB(os.path.join(LEVELDBPATH, 'yuanshi_daily_count_%s' % date),
                                        block_cache_size=8 * (2 << 25), write_buffer_size=8 * (2 << 25))

    
    print 'init xapian weibo'
    # init xapian weibo
    datestr = date.replace('-', '')
    xapian_search_weibo = getXapianWeiboByDate(datestr)

    if not xapian_search_weibo:
        return 'wrong'
    
    # iter
    weibos = xapian_search_weibo.iter_all_docs(fields=['user', 'reposts_count', 'comments_count', 'attitudes_count'])

    # cal
    count = 0
    te = ts = time.time()
    for weibo in weibos:
        if count % 10000 == 0:
            te = time.time()
            print count, '%s sec' % (te - ts), 'identify weibo calc to leveldb ', date
            ts = te
        count += 1

        try:
            reposts_count = int(weibo['reposts_count'])
        except:
            reposts_count = 0

        try:
            comments_count = int(weibo['comments_count'])
        except:
            comments_count = 0

        try:
            attitudes_count = int(weibo['attitudes_count'])
        except:
            attitudes_count = 0

        uid = weibo['user']

        try:
            active, important, follower, domain = dailycount_bucket.Get(str(uid)).split('_')
            active = int(active)
            important = int(important)
            active += 1
            important += reposts_count + comments_count + attitudes_count
        except KeyError:
            active = 1
            important = reposts_count + comments_count + attitudes_count
            domain = user2domainFromLeveldb(uid)
            follower = user2FollowersCount(uid)

        dailycount_bucket.Put(str(uid), str(active) + '_' + str(important) + '_' + str(follower) + '_' + str(domain))
    
    return 'Done'
Пример #3
0
def cal_topic_sentiment_by_date(topic, datestr, duration):
    """Run the 15-minute-window topic sentiment cron starting at *datestr*."""
    window_start = datetime2ts(datestr)
    window_end = window_start + Fifteenminutes
    # xapian indexes are keyed by compact date strings (YYYYMMDD)
    compact_date = datestr.replace('-', '')
    searcher = getXapianWeiboByDate(compact_date)
    if not searcher:
        return
    sentimentCronTopic(topic, searcher,
                       start_ts=window_start,
                       over_ts=window_end,
                       during=duration)
def topic_not_calc():
    topics = _topic_not_calc()
    for topic in topics:
        query = topic.topic
    	end_ts = topic.end
    	during = topic.range
        
        start_ts = end_ts - during
        start_datestr = ts2datetime(start_ts)
        end_datestr = ts2datetime(end_ts)

        ts_1 = datetime2ts(start_datestr)
        ts_2 = datetime2ts(end_datestr)
        days = (ts_2 - ts_1) / Day

        date_list = []
        for i in range(0, days):
            datestr = datetime.date.fromtimestamp(ts_1 + i * Day).isoformat()
            date_list.append(datestr)
            datestr = datestr.replace('-', '')
            xapian_search_weibo = getXapianWeiboByDate(datestr)
            print xapian_search_weibo
            if xapian_search_weibo:
                sentimentRealTimeTopic(xapian_search_weibo, query, ts_1 + i * Day, ts_1 + (i + 1) * Day)
Пример #5
0
    now_datestr = sys.argv[1] # datestr as '20130921'
    # NOTE(review): this bucket is opened here, but its directory is removed
    # and the DB re-opened below — the first open looks redundant and holds
    # the directory while it is rmtree'd; confirm and drop one of the opens.
    daily_identify_aifd_bucket = leveldb.LevelDB(os.path.join(LEVELDBPATH, 'yuanshi_daily_count_%s' % now_datestr),
                                                 block_cache_size=8 * (2 << 25), write_buffer_size=8 * (2 << 25))

    # check if xapian data is ready: poll once a minute for the stub file
    # that signals the day's index has been fully built
    xapian_stub_file = '%s%s' % (DYNAMIC_XAPIAN_WEIBO_STUB_PATH, now_datestr)
    while 1:
        if os.path.isfile(xapian_stub_file):
            print '%s xapian data stub file is prepared' % now_datestr
            break
        else:
            print '%s xapian data stub file is not prepared' % now_datestr
            time.sleep(60)

    # init xapian weibo
    xapian_search_weibo = getXapianWeiboByDate(now_datestr)

    # init leveldb: start the day's count bucket from a clean slate
    # (rmtree best-effort — the directory may not exist yet)
    try:
        shutil.rmtree(os.path.join(LEVELDBPATH, 'yuanshi_daily_count_%s' % now_datestr))
    except:
        pass
    daily_identify_aifd_bucket = leveldb.LevelDB(os.path.join(LEVELDBPATH, 'yuanshi_daily_count_%s' % now_datestr),
                                                 block_cache_size=8 * (2 << 25), write_buffer_size=8 * (2 << 25))

    # snapshot the shared followers DB into a run-scoped copy
    # (mkdir best-effort — it may already exist from a previous run)
    try:
        os.mkdir(os.path.join(LEVELDBPATH, 'linhao_user2followers_identify_r_%s' % now_datestr))
    except:
        pass
    copytree(os.path.join(LEVELDBPATH, 'yuanshi_daily_user_followers'), \
             os.path.join(LEVELDBPATH, 'linhao_user2followers_identify_r_%s' % now_datestr))
Пример #6
0
        print r['text'].encode('utf-8')
        print r['timestamp']
        print r['terms']

    print 'hits: %s' % count
else:
    print 'no results'
'''

# test the search over the 'topics' field: count hits per day, then total
datestr_list = ['20130902', '20130903', '20130904', \
                '20130905', '20130906', '20130907']
# datestr_list = ['20130907']
k = 0  # running total of hits across all days
for datestr in datestr_list:
    s = getXapianWeiboByDate(datestr)
    count, results = s.search(query={'topics': [u'东盟', u'博览会']}, fields=['text'])#fields=fields_list)
    '''
    f = open(datestr+'.txt', 'wb')
    for result in results():
        save_line = result['text'].encode('utf-8')
        f.write(save_line+'\n')
    '''
    print 'count:', count
    k = k+count
print 'all_count:', k

'''
stopic=u'中国'

query_dict = {
Пример #7
0
def cal_field_sentiment_by_date(domainid, datestr, duration):
    """Run the daily field (domain) sentiment calculation for one day."""
    day_start = datetime2ts(datestr)
    day_end = day_start + Day
    # xapian indexes are keyed by compact date strings (YYYYMMDD)
    searcher = getXapianWeiboByDate(datestr.replace('-', ''))
    sentiment_field(domainid, searcher, start_ts=day_start, over_ts=day_end, during=duration)
Пример #8
0
def cal_sentiment_kcount_by_date(datestr, duration):
    """Run the daily sentiment keyword-count calculation for one day."""
    day_start = datetime2ts(datestr)
    day_end = day_start + Day
    # xapian indexes are keyed by compact date strings (YYYYMMDD)
    searcher = getXapianWeiboByDate(datestr.replace('-', ''))
    sentiment_keywords(searcher, start_ts=day_start, over_ts=day_end, during=duration)
def sentimentRealTimeTopic(query, start_ts, end_ts, save_fields=RESP_ITER_KEYS, during=Fifteenminutes, calc='all', w_limit=TOP_WEIBOS_LIMIT, k_limit=TOP_KEYWORDS_LIMIT, sort_field=SORT_FIELD):
    if query and query != '':

    	start_ts = int(start_ts)
    	over_ts = int(end_ts)
    	
        over_ts = ts2HourlyTime(over_ts, during)
        interval = (over_ts - start_ts) / during

        for i in range(interval, 0, -1):

    	    emotions_count = {}
    	    emotions_kcount = {}
    	    emotions_weibo = {}

    	    begin_ts = over_ts - during * i
            xapian_datestr = datetime.date.fromtimestamp(begin_ts).isoformat()
            xapian_search_weibo = getXapianWeiboByDate(xapian_datestr.replace('-', ''))
            if not xapian_search_weibo:
                return

            end_ts = begin_ts + during
            print begin_ts, end_ts, 'topic realtime %s starts calculate' % query.encode('utf-8')

            query_dict = {
                'timestamp': {'$gt': begin_ts, '$lt': end_ts},
                '$or': []
            }

            for term in query.strip().split(','):
                if term:
                    query_dict['$or'].append({'text': [term]})

            if calc == 'all':
                for k, v in emotions_kv.iteritems():
                    query_dict['sentiment'] = v
                    scount = xapian_search_weibo.search(query=query_dict, count_only=True)
                    mset = xapian_search_weibo.search(query=query_dict, sort_by=[sort_field], \
                                                      max_offset=w_limit, mset_direct=True)
                    kcount = top_keywords(gen_mset_iter(xapian_search_weibo, mset, fields=['terms']), top=k_limit)
                    top_ws = top_weibos(gen_mset_iter(xapian_search_weibo, mset, fields=save_fields), top=w_limit)
                    emotions_count[v] = [end_ts, scount]
                    emotions_kcount[v] = [end_ts, kcount]
                    emotions_weibo[v] = [end_ts, top_ws]

                    print k, v, ', emotions count: ', emotions_count, ', keywords length: ', len(kcount), ', weibos count: ', len(top_ws)
                
                print 'save emotions count, keywords and weibo'
                save_rt_results('count', query, emotions_count, during)
                save_rt_results('kcount', query, emotions_kcount, during, klimit=k_limit)
                save_rt_results('weibos', query, emotions_weibo, during, wlimit=w_limit)
                
            elif calc == 'count':
                for k, v in emotions_kv.iteritems():
                    query_dict['sentiment'] = v
                    scount = xapian_search_weibo.search(query=query_dict, count_only=True)
                    emotions_count[v] = [end_ts, scount]

                save_rt_results('count', query, emotions_count, during)
            
            else:
                if calc == 'kcount':
                    for k, v in emotions_kv.iteritems():
                        query_dict['sentiment'] = v
                        count, get_results = xapian_search_weibo.search(query=query_dict, fields=RESP_ITER_KEYS, \
                                                                            sort_by=[SORT_FIELD], max_offset=w_limit)
                        kcount = top_keywords(get_results, top=k_limit)
                        emotions_kcount[v] = [end_ts, kcount]
                    
                    save_rt_results('kcount', query, emotions_kcount, during, TOP_KEYWORDS_LIMIT)

                if calc == 'weibos':
                    for k, v in emotions_kv.iteritems():
                        query_dict['sentiment'] = v
                        count, get_results = xapian_search_weibo.search(query=query_dict, fields=RESP_ITER_KEYS, \
                                                                        sort_by=[sort_field], max_offset=w_limit)
                        top_ws = top_weibos(get_results, top=w_limit)
                        emotions_weibo[v] = [end_ts, top_ws]
                    
                    save_rt_results('weibos', query, emotions_weibo, during, TOP_WEIBOS_LIMIT)
Пример #10
0
        # Segment the text with scws (noun terms only) and tally each term
        # into the daily keyword LevelDB.
        terms = cut(scws, _utf_encode(text), f='n')
        for term in terms:
            try:
                # existing keyword: bump its count
                kcount = int(daily_profile_domain_keywords_db.Get(str(term)))
                daily_profile_domain_keywords_db.Put(str(term), str(kcount + 1))
            except KeyError:
                # first occurrence of this term today
                daily_profile_domain_keywords_db.Put(str(term), str(1))

        count += 1


if __name__ == '__main__':
    # init xapian weibo for the batch date given on the command line
    import sys
    batch_date_1 = sys.argv[1] # '20130905'
    xapian_search_weibo = getXapianWeiboByDate(batch_date_1)

    # load the official seed accounts and the scws segmenter
    seed_set = get_official_seed_set()
    scws = load_scws()
    
    # update person basics once a week
    # sharding = False
    # if sharding:
    #    # connect to the database via MySQLdb
    #    try:
    #        cobar_conn = MySQLdb.connect(host=COBAR_HOST, user=COBAR_USER, db='cobar_db_weibo', port=COBAR_PORT, charset='utf8')
    #        print 'connection success'
    #    except Exception, e:
    #        print e
    #        sys.exit()