Пример #1
0
def save_trend_maker(topic, date, windowsize, trend_maker,topic_xapian_id):
    xapian_search_weibo = getXapianWeiboByTopic(topic_xapian_id) # topic id 要做一下处理
    makers = trend_maker
    rank = 0
    user_exist_list = []
    items_exist = db.session.query(TrendMaker).filter(TrendMaker.topic==topic ,\
                                                      TrendMaker.date==date ,\
                                                      TrendMaker.windowsize==windowsize).all()
    for item_exist in items_exist:
        db.session.delete(item_exist)
    db.session.commit()
    for maker in makers:
        uid = maker[0]
        if uid in user_exist_list:
            continue
        user_exist_list.append(uid)
        if rank>=trend_maker_count:
            break
        rank += 1
        wid = maker[1]
        value = maker[2] #内容相关度---关键词命中个数
        key_item = maker[3] # 命中的关键词 
        user_info = get_user_info(uid)
        weibo_info = xapian_search_weibo.search_by_id(wid, fields=weibo_fields_list)
        #print 'trend_maker weibo_info:', weibo_info
        domain = uid2domain(uid)
        timestamp = int(weibo_info['timestamp'])
        # 修改model
        item = TrendMaker(topic, date, windowsize, uid, timestamp, json.dumps(user_info), json.dumps(weibo_info), domain, rank,value, json.dumps(key_item))
        db.session.add(item)
    db.session.commit()
    print 'save_trend_maker success'
Пример #2
0
def save_trend_pusher(topic, date, windowsize, trend_pusher, topic_xapian_id):
    xapian_search_weibo = getXapianWeiboByTopic(topic_xapian_id) # topic id 要做一下处理
    pushers = trend_pusher
    rank = 0
    user_exist_list = []
    items_exist = db.session.query(TrendPusher).filter(TrendPusher.topic==topic ,\
                                                       TrendPusher.date==date ,\
                                                       TrendPusher.windowsize==windowsize).all()
    for item_exist in items_exist:
        db.session.delete(item_exist)
    db.session.commit()
    for pusher in pushers:
        uid = pusher[0]
        if uid in user_exist_list:
            continue
        user_exist_list.append(uid)
        if rank>=trend_pusher_count:
            break
        rank += 1
        wid = pusher[1]
        user_info = get_user_info(uid)
        weibo_info = xapian_search_weibo.search_by_id(wid, fields=weibo_fields_list)
        domain = uid2domain(uid)
        timestamp = int(weibo_info['timestamp'])
        item = TrendPusher(topic, date, windowsize, uid, timestamp, json.dumps(user_info), json.dumps(weibo_info), domain, rank)
        db.session.add(item)
    db.session.commit()
    print 'save_trend_pusher success'
Пример #3
0
def main(topic, start_time, end_time):
    start_ts = datetime2ts(start_time)
    end_ts = datetime2ts(end_time) + 24 * 3600
    ##    datestrlist = []
    ##    for datestr in datestr_list:
    ##        datestr_new = datestr.replace('-', '')
    ##        datestrlist.append(datestr_new)
    query_dict = {
        'timestamp': {
            '$gt': start_ts,
            '$lt': end_ts
        },
    }
    ##    t = topic.split(',')
    ##    for ctopic in t:
    ##        query_dict['$and'].append({'topics': ctopic})
    start = time.time()
    ##    statuses_search = getXapianWeiboByDuration(datestrlist)
    ##    count, get_results = statuses_search.search(query=query_dict, fields=fields_list)
    topic_id = getTopicByName(topic)['_id']
    xapian_search_weibo = getXapianWeiboByTopic(topic_id)
    count, get_results = xapian_search_weibo.search(query=query_dict,
                                                    fields=fields_list)
    end = time.time()

    #print count
    print 'search takes %s s' % (end - start)
    weibo = []
    for r in get_results():
        weibo.append([
            r['_id'], r['user'], r['text'].encode('utf-8'), r['timestamp'],
            r['reposts_count'], r['comments_count']
        ])

    ad_main(topic, weibo, '0914', 10)  #开始进行微博数据的观点挖掘
Пример #4
0
def save_trend_pusher(topic, date, windowsize, trend_pusher, topic_xapian_id):
    xapian_search_weibo = getXapianWeiboByTopic(
        topic_xapian_id)  # topic id 要做一下处理
    pushers = trend_pusher
    rank = 0
    user_exist_list = []
    items_exist = db.session.query(TrendPusher).filter(TrendPusher.topic==topic ,\
                                                       TrendPusher.date==date ,\
                                                       TrendPusher.windowsize==windowsize).all()
    for item_exist in items_exist:
        db.session.delete(item_exist)
    db.session.commit()
    for pusher in pushers:
        uid = pusher[0]
        if uid in user_exist_list:
            continue
        user_exist_list.append(uid)
        if rank >= trend_pusher_count:
            break
        rank += 1
        wid = pusher[1]
        user_info = get_user_info(uid)
        weibo_info = xapian_search_weibo.search_by_id(wid,
                                                      fields=weibo_fields_list)
        domain = uid2domain(uid)
        timestamp = int(weibo_info['timestamp'])
        item = TrendPusher(topic, date, windowsize, uid, timestamp,
                           json.dumps(user_info), json.dumps(weibo_info),
                           domain, rank)
        db.session.add(item)
    db.session.commit()
    print 'save_trend_pusher success'
Пример #5
0
def get_ds_info(text, userid, topic, timestamp_add, DEFAULT_INTERVAL,
                topic_xapian_id):
    """Look up the weibo posted by ``userid`` with ``text`` in a topic index.

    Args:
        text: weibo text to search for.
        userid: id of the user who posted the weibo.
        topic: unused; kept so existing callers keep working.
        timestamp_add: timestamp at which the final reposting user posted
            their weibo.
        DEFAULT_INTERVAL: offset added to ``timestamp_add`` to build the
            fallback timestamp when the weibo is not found.
        topic_xapian_id: id handed to ``getXapianWeiboByTopic``.

    Returns:
        The last record yielded by the search (requested fields:
        timestamp, comments_count, attitude_count, reposts_count,
        retweeted_uid).  When the search yields nothing, a dict with the
        same five keys holding fallback values is returned instead.
    """
    xapian_search_weibo = getXapianWeiboByTopic(topic_xapian_id)
    query_dict = {'user': userid, 'text': text}
    count, result = xapian_search_weibo.search(query=query_dict,
                                               fields=[
                                                   'timestamp',
                                                   'comments_count',
                                                   'attitude_count',
                                                   'reposts_count',
                                                   'retweeted_uid'
                                               ])

    # `result` is a generator function, so it is truthy even when the search
    # matched nothing.  Track whether anything was actually yielded instead
    # of relying on the truthiness of `result`; otherwise the fallback below
    # is unreachable and callers receive an empty dict.
    direct_superior_info = None
    if result:
        for rr in result():
            direct_superior_info = rr

    if direct_superior_info is None:
        direct_superior_info = {
            'timestamp': DEFAULT_INTERVAL + timestamp_add,
            'comments_count': u'未知',
            'attitude_count': u'未知',
            'reposts_count': u'未知',
            'retweeted_uid': None,
        }

    return direct_superior_info
Пример #6
0
def sort_makers(keyword_data, begin_ts, end_ts, ts_list, topic_xapian_id):
    """Rank users by how many topic keywords one of their weibos contains.

    The topic index is searched for weibos whose timestamp lies between
    ``begin_ts - Hour`` and ``end_ts`` (passed as '$gt' / '$lt'), sorted by
    '-timestamp'.  At most ``fu_tr_top_keyword`` weibos are inspected; for
    each, the terms that also occur in ``keyword_data`` are collected.

    Args:
        keyword_data: container of keywords; terms are decoded from UTF-8
            before being tested against it.
        begin_ts: start of the window (one ``Hour`` is subtracted).
        end_ts: end of the window.
        ts_list: unused; kept so existing callers keep working.
        topic_xapian_id: id handed to ``getXapianWeiboByTopic``.

    Returns:
        A list of ``(uid, [wid, key_term_count, key_term])`` tuples sorted
        by ``key_term_count`` in descending order, at most
        ``fu_tr_top_keyword`` long, or ``[]`` when the search reports zero
        hits.  When a user has several inspected weibos, the one iterated
        last is the one kept.
    """
    # An earlier implementation lived here as a string literal and thereby
    # became the function's docstring; it has been removed as dead code.
    window_start = begin_ts - Hour
    query_dict = {'timestamp': {'$gt': window_start, '$lt': end_ts}}
    xapian_search_weibo = getXapianWeiboByTopic(topic_id=topic_xapian_id)
    count, search_weibo = xapian_search_weibo.search(query=query_dict, sort_by=['-timestamp'], fields=field_list)
    if count == 0:
        return []

    weibo_term = {}
    for position, weibo in enumerate(search_weibo(), 1):
        if position > fu_tr_top_keyword:
            break
        uid = weibo['user']
        wid = weibo['_id']
        key_term = []
        for raw_term in weibo['terms']:
            term = raw_term.decode('utf-8')
            if term in keyword_data:
                key_term.append(term)
        weibo_term[uid] = [wid, len(key_term), key_term]

    sort_weibo_term = sorted(weibo_term.items(), key=lambda x: x[1][1], reverse=True)
    return sort_weibo_term[:fu_tr_top_keyword]
Пример #7
0
def get_tsu(new_peaks, new_bottom, ts_list, topic_xapian_id):
    """Rank retweeted users by how often they are retweeted in a window.

    The window goes from ``ts_list[new_bottom[0]]`` to
    ``ts_list[new_peaks[0]]`` (passed as the '$gt' / '$lt' bounds of the
    timestamp query); when that start lies after the end, ``ts_list[0]``
    is used as the start instead.  Only weibos with ``message_type`` 3 are
    searched (presumably reposts -- TODO confirm).

    Returns a list of ``(retweeted_uid, count)`` pairs sorted by count,
    largest first.
    """
    #print 'new_peaks:', new_peaks
    #print 'new_bottom:', new_bottom
    #print 'ts_list:', ts_list
    end_ts = ts_list[new_peaks[0]]
    begin_ts = ts_list[new_bottom[0]]
    # Fall back to the very first timestamp when the bottom lies after the peak.
    if begin_ts > end_ts:
        begin_ts = ts_list[0]
    query_dict = {
        'timestamp': {
            '$gt': begin_ts,
            '$lt': end_ts
        },
        'message_type': 3
    }
    print 'query_dict:', query_dict
    print 'begin_ts:', ts2date(begin_ts)
    print 'end_ts:', ts2date(end_ts)
    xapian_search_weibo = getXapianWeiboByTopic(topic_xapian_id)  # the topic id needs to be taken into account here
    count, results = xapian_search_weibo.search(
        query=query_dict, fields=['retweeted_uid', 'retweeted_mid'])
    print 'count:', count
    # ruid_count: retweeted uid -> number of matching weibos retweeting it.
    ruid_count = {}
    # ruid_mid: retweeted uid -> retweeted mid of the last matching weibo seen.
    ruid_mid = {}
    for result in results():
        r_uid = result['retweeted_uid']
        # Skip weibos without a usable retweeted uid.
        if (r_uid == 0) or (not r_uid):
            continue
        ruid_mid[r_uid] = result['retweeted_mid']
        try:
            ruid_count[r_uid] += 1
        except KeyError:
            ruid_count[r_uid] = 1
    sorted_result = sorted(ruid_count.items(),
                           key=lambda d: d[1],
                           reverse=True)
    # NOTE(review): the next line is garbled (a run of '*' followed by an
    # opening triple quote) and is not valid Python as written; the original
    # statement needs to be restored from the upstream source.
    print 'top_source_user:'******'''
    count, top_weibo = xapian_search_weibo.search(query={'_id':top_source_mid}, fields=['timestamp'])
    print 'count:', count
    for i in top_weibo():
        timestamp = i['timestamp']
        print 'timestamp:', ts2date(int(timestamp))
    '''
    return sorted_result
Пример #8
0
def get_tsu(new_peaks, new_bottom, ts_list, topic_xapian_id):
    """Rank retweeted users by how often they are retweeted in a window.

    The window goes from ``ts_list[new_bottom[0]]`` to
    ``ts_list[new_peaks[0]]`` (passed as the '$gt' / '$lt' bounds of the
    timestamp query); when that start lies after the end, ``ts_list[0]``
    is used as the start instead.  Only weibos with ``message_type`` 3 are
    searched (presumably reposts -- TODO confirm).

    Returns a list of ``(retweeted_uid, count)`` pairs sorted by count,
    largest first.
    """
    #print 'new_peaks:', new_peaks
    #print 'new_bottom:', new_bottom
    #print 'ts_list:', ts_list
    end_ts = ts_list[new_peaks[0]]
    begin_ts = ts_list[new_bottom[0]]
    # Fall back to the very first timestamp when the bottom lies after the peak.
    if begin_ts>end_ts:
        begin_ts = ts_list[0]
    query_dict = {
        'timestamp':{'$gt':begin_ts, '$lt':end_ts},
        'message_type':3
        }
    print 'query_dict:', query_dict
    print 'begin_ts:', ts2date(begin_ts)
    print 'end_ts:', ts2date(end_ts)
    xapian_search_weibo = getXapianWeiboByTopic(topic_xapian_id)# the topic id needs to be taken into account here
    count, results = xapian_search_weibo.search(query=query_dict, fields=['retweeted_uid','retweeted_mid'])
    print 'count:', count
    # ruid_count: retweeted uid -> number of matching weibos retweeting it.
    ruid_count = {}
    # ruid_mid: retweeted uid -> retweeted mid of the last matching weibo seen.
    ruid_mid = {}
    for result in results():
        r_uid = result['retweeted_uid']
        # Skip weibos without a usable retweeted uid.
        if (r_uid == 0) or (not r_uid):
            continue
        ruid_mid[r_uid] = result['retweeted_mid']
        try:
            ruid_count[r_uid] += 1
        except KeyError:
            ruid_count[r_uid] = 1
    sorted_result = sorted(ruid_count.items(), key=lambda d:d[1], reverse=True)
    # NOTE(review): the next line is garbled (a run of '*' followed by an
    # opening triple quote) and is not valid Python as written; the original
    # statement needs to be restored from the upstream source.
    print 'top_source_user:'******'''
    count, top_weibo = xapian_search_weibo.search(query={'_id':top_source_mid}, fields=['timestamp'])
    print 'count:', count
    for i in top_weibo():
        timestamp = i['timestamp']
        print 'timestamp:', ts2date(int(timestamp))
    '''    
    return sorted_result
Пример #9
0
def get_ds_info(text, userid, topic, timestamp_add, DEFAULT_INTERVAL, topic_xapian_id):
    """Fetch the stored record of the weibo ``userid`` posted with ``text``.

    Args:
        text: weibo text to search for.
        userid: id of the user who posted the weibo.
        topic: unused; kept so existing callers keep working.
        timestamp_add: timestamp at which the final reposting user posted
            their weibo.
        DEFAULT_INTERVAL: offset added to ``timestamp_add`` to build the
            fallback timestamp when the weibo is not found.
        topic_xapian_id: id handed to ``getXapianWeiboByTopic``.

    Returns:
        The last record yielded by the search (requested fields:
        timestamp, comments_count, attitude_count, reposts_count,
        retweeted_uid), or a dict with those five keys set to fallback
        values when the search yields nothing.
    """
    xapian_search_weibo = getXapianWeiboByTopic(topic_xapian_id)
    query_dict = {
        'user': userid,
        'text': text,
    }
    wanted_fields = ['timestamp', 'comments_count', 'attitude_count',
                     'reposts_count', 'retweeted_uid']
    count, result = xapian_search_weibo.search(query=query_dict, fields=wanted_fields)

    # `result` is a generator function and therefore always truthy, so its
    # truthiness cannot signal "no match".  Remember the last yielded record
    # and fall back to defaults only if there was none; previously the
    # fallback was unreachable and an empty dict was returned.
    last_record = None
    if result:
        for rr in result():
            last_record = rr
    if last_record is not None:
        return last_record

    return {
        'timestamp': DEFAULT_INTERVAL + timestamp_add,
        'comments_count': u'未知',
        'attitude_count': u'未知',
        'reposts_count': u'未知',
        'retweeted_uid': None,
    }
Пример #10
0
def save_trend_maker(topic, date, windowsize, trend_maker, topic_xapian_id):
    xapian_search_weibo = getXapianWeiboByTopic(
        topic_xapian_id)  # topic id 要做一下处理
    makers = trend_maker
    rank = 0
    user_exist_list = []
    items_exist = db.session.query(TrendMaker).filter(TrendMaker.topic==topic ,\
                                                      TrendMaker.date==date ,\
                                                      TrendMaker.windowsize==windowsize).all()
    for item_exist in items_exist:
        db.session.delete(item_exist)
    db.session.commit()
    for maker in makers:
        uid = maker[0]
        if uid in user_exist_list:
            continue
        user_exist_list.append(uid)
        if rank >= trend_maker_count:
            break
        rank += 1
        wid = maker[1]
        value = maker[2]  #内容相关度---关键词命中个数
        key_item = maker[3]  # 命中的关键词
        user_info = get_user_info(uid)
        weibo_info = xapian_search_weibo.search_by_id(wid,
                                                      fields=weibo_fields_list)
        #print 'trend_maker weibo_info:', weibo_info
        domain = uid2domain(uid)
        timestamp = int(weibo_info['timestamp'])
        # 修改model
        item = TrendMaker(topic, date, windowsize, uid, timestamp,
                          json.dumps(user_info), json.dumps(weibo_info),
                          domain, rank, value, json.dumps(key_item))
        db.session.add(item)
    db.session.commit()
    print 'save_trend_maker success'
Пример #11
0
def cal_topic_quotasystem_count_by_date(topic, start, end):
    #确定要查询Weibo的时间段
    start_date = ts2datetime(start)
    end_date = ts2datetime(end) # 若结束时间戳为2014:09:02 00:00:00,实际上还是算在9.1那一天中
    print 'start, end:', start_date, end_date
    windowsize = (end - start) / Day
    print 'windowsize:', windowsize
    datestr_list = []
    for i in range(windowsize):
        time = start + i * Day
        time_date = ts2datetime(time)
        datestr_list.append(time_date.replace('-', ''))
    print 'datestr_list:', datestr_list
    # 
    topic_xapian_id = weibo_topic2xapian(topic, start, end)
    print 'topic_xapian_id:', topic_xapian_id
    xapian_search_weibo = getXapianWeiboByTopic(topic_xapian_id)
    '''
    xapian_search_weibo = getXapianWeiboByDuration(datestr_list) # 这里是根据时间段进行查询的
    xapian_search_topic = getXapianWeiboByTopic(topic) # 直接查topic建立的索引
    '''
    if xapian_search_weibo:
        print '******start_compute'
        quota_attention(topic, xapian_search_weibo, start_ts=start, end_ts=end)
        quota_duration(topic, start_ts=start, end_ts=end)
        print 'save duration success'
        quota_sensitivity(topic, start_ts=start, end_ts=end)
        print 'save sensitivity success'
        quota_importance(topic, start_ts=start, end_ts=end)
        print 'save importance success'
        quota_sentiment(topic, xapian_search_weibo, start_ts=start, end_ts=end)
        print 'save sentiment success'
        quota_coverage(topic, xapian_search_weibo, start_ts=start, end_ts=end) # 覆盖度计算
        print 'save coverage success'
        quota_person_sensitivity(topic, xapian_search_weibo, start_ts=start, end_ts=end) # 敏感人物参与度
        print 'save person_sensitivity success'
Пример #12
0
                mset = xapian_search_weibo.search(query=query_dict, sort_by=[sort_field], \
                                                  max_offset=w_limit, mset_direct=True)

                kcount = top_keywords(gen_mset_iter(xapian_search_weibo, mset, fields=['terms']), top=k_limit)
                top_ws = top_weibos(results, top=w_limit)

                mtype_count[v] = [end_ts, count]
                mtype_kcount[v] = [end_ts, kcount]
                mtype_weibo[v] = [end_ts, top_ws]

            save_pc_results(topic, mtype_count, during)
            save_kc_results(topic, mtype_kcount, during, k_limit)
            save_ws_results(topic, mtype_weibo, during, w_limit)


# Script entry point: compute the propagation results of one topic between
# two dates given on the command line (topic, start date, end date).
if __name__ == '__main__':
    topic = sys.argv[1] # topic name as UTF-8 bytes, e.g. 'APEC' or a Chinese topic name
    start_date = sys.argv[2] # e.g. '2015-02-23'
    end_date = sys.argv[3] # e.g. '2015-03-02'

    # Decode the command-line bytes before looking the topic up by name.
    topic = topic.decode('utf-8')
    topic_id = getTopicByName(topic)['_id']
    start_ts = datetime2ts(start_date)
    end_ts = datetime2ts(end_date)

    # Value passed to propagateCronTopic as `during`.
    duration = Fifteenminutes
    xapian_search_weibo = getXapianWeiboByTopic(topic_id)

    print 'topic: ', topic.encode('utf8'), 'from %s to %s' % (start_ts, end_ts)
    propagateCronTopic(topic, xapian_search_weibo, start_ts, end_ts, during=duration)
Пример #13
0
def make_network(topic, date, window_size, topic_xapian_id, max_size=100000, attribute_add = False):
    topics = topic.strip().split(',')
    end_time = int(datetime2ts(date))
    start_time = int(end_time - window2time(window_size))
    print 'start, end:', start_time, end_time
    #topic_id='545f4c22cf198b18c57b8014'
    topic_id = topic_xapian_id
    statuses_search = getXapianWeiboByTopic(topic_id)
    '''
    count, test_results = statuses_search.search(query={'timestamp':{'$gt': start_time, '$lt': end_time}})
    for i in test_results():
        print i
        break
    '''

    g = nx.DiGraph() # 初始化一个有向图
    gg = nx.Graph() # 为计算quota初始化一个无向图
    ds_dg = nx.DiGraph() # direct_superior_network 有向直接上级转发网络
    ds_udg = nx.Graph() # undirect_superior_network 无向直接上级转发网络
    query_dict = {'timestamp':{'$gt': start_time, '$lt': end_time}}
    #need repost index
    ''' 
    for ctopic in topics:
        query_dict['topics'].append(ctopic)
    '''
    print 'query_dict:', query_dict
    count, get_statuses_results = statuses_search.search(query=query_dict, fields=['_id', 'user', 'timestamp', 'retweeted_mid','retweeted_uid', 'text', 'reposts_count', 'comments_count', 'attitude_count'], max_offset=max_size)
    print 'count_before_nad:', count
    results_list = []
    '''
    根据微博文本进行广告微博筛选
    '''
    if count:
        for weibo in get_statuses_results():
            results_list.append([weibo['_id'],weibo['text']])
        scount, data_wid = ad_classifier(results_list)
    else:
        data_wid = []
        scount = 0
    print 'count_after_nad:', scount
    
    new_attribute_dict = {} # 星形源头转发网络需要添加的节点对应的text、reposts_count、comment_count、 attitude_count
    ds_new_attribute_dict = {} # 直接上级转发网络需要添加的属性:即节点对应的text、 reposts_count、 comment_count 、attitude_count    
    new_query_dict = {
        '$or':[]
        } # 用于查询retweeted_mid对应的weibo内容
    ds_new_query_dict ={
        '$or':[]
        } # 用于查询direct_superior_user为retweeted_uid对应的retweeted_mid的微博内容
    map_dict = {} # map_dict = {retweeted_mid:[retweeted_uid, user, timestamp],...} 保存_id timestamp与其对应的retweeted_mid之间的对应关系
    ds_map_dict = {} # ds_dict = {retweeted_mid:[retweeted_uid, user, timestamp]} 直接上级转发网络中直接上级就是源头上级时,对应关系
    get_statuses_results = [r for r in get_statuses_results() if r['retweeted_uid'] != 0]
    set_repost_name = set()
    for status in get_statuses_results:
        if str(status['_id']) in data_wid:
            '''
            当微博信息非垃圾时,进行new_attribute_dict的添加----即[a b]->添加a节点的微博信息
            '''
            nad_uid = status['user']
            nad_id = status['_id']
            r_uid = status['retweeted_uid']
            r_mid = status['retweeted_mid']
            if attribute_add == True:
                text_add = status['text']
                reposts_count_add = status['reposts_count']
                comment_count_add = status['comments_count']
                attitude_count_add = status['attitude_count']
                timestamp_add = status['timestamp']
                try:
                    new_attribute_dict[nad_uid].append([text_add, reposts_count_add, comment_count_add, attitude_count_add, timestamp_add, r_uid])
                    ds_new_attribute_dict[nad_uid].append([text_add, reposts_count_add, comment_count_add, attitude_count_add, timestamp_add, r_uid])
                except:
                    new_attribute_dict[nad_uid] = [[text_add, reposts_count_add, comment_count_add, attitude_count_add, timestamp_add, r_uid]]
                    ds_new_attribute_dict[nad_uid] = [[text_add, reposts_count_add, comment_count_add, attitude_count_add, timestamp_add, r_uid]]
            #print 'len(new_attribute_dict):', len(new_attribute_dict)
            '''
            区别于原创微博
            当是转发微博时,获取直接转发上级----例子:[a b c]->b
            '''
            if status['retweeted_uid'] and status['retweeted_uid']!=0:
                print 'before get_superior_userid'
                
                direct_superior_userid = get_superior_userid(status) # 获取直接转发上级--只获取一跳
                '''
                repost_name = get_superior_userid(status) # test
                set_repost_name.add(repost_name) # test
                '''
                
                print 'user_id', direct_superior_userid
                if not direct_superior_userid:
                    
                    #当直接转发上级的userid获取不到时,则认为直接转发上级就是源头转发微博
                    
                    direct_superior_userid = r_uid
                    ds_dg.add_edge(nad_uid, direct_superior_userid)
                    ds_udg.add_edge(nad_uid, direct_superior_userid)
                    ds_new_query_dict['$or'].append({'_id':r_mid})
                    ds_map_dict[r_mid] = [r_uid, nad_uid, status['timestamp']]
                    # 当直接上级就是源头上级时,通过查询xapian获取weibo_text timestamp comment_counts repost_counts attitude_counts
                else:
                    
                    #存在直接转发上级
                    
                    ds_dg.add_edge(nad_uid, direct_superior_userid)
                    ds_udg.add_edge(nad_uid, direct_superior_userid)
                    if attribute_add == 'True':
                        weibo_text = status['text']
                        # 获取直接上级微博文本内容
                        weibo_test1 = weibo_text.split('//@') # 此处不会出现文本中没有//@结构,因为这里的直接转发上级是通过昵称转化而得到的,所以一定有//@结构
                        weibo_test2 = weibo_test1[1]
                        m_index = weibo_test2.find(':')
                        direct_superior_weibo = weibo_test2[m_index+1:]
                        m_all_index = weibo_text.find(':')
                        direct_superior_weibos = weibo_text[m_all_index+1:]
                        
                        #需要根据文本内容和r_uid获取timestamp
                        
                        direct_superior_info = get_ds_info(direct_superior_weibos, direct_superior_userid, topic, timestamp_add, topic_xapian_id) # timestamp_add是最终转发微博额时间戳
                        # 通过直接上级微博文本内容和用户id,查询topic对应的Xapian获取该条微博的时间戳等信息
                        # 如果在xapian中查不到这条信息的timestamp,则根据其转发链条关系,以(源头用户ts-重点用户ts)/(链条中人数-1)----源头微博还是需要根据mid查询,还是有可能不在xapian里面
                        # 根据uid和text获取[timestamp, comment_counts, attitude_counts, reposts_counts, r_uid]
                        timestamp = direct_superior_info['timestamp']
                        comment_count = direct_superior_info['comments_count']
                        attitude_count = direct_superior_info['attitude_count']
                        reposts_count = direct_superior_info['reposts_count']
                        retweeted_uid = direct_superior_info['retweeted_uid']
                        try:
                            ds_new_attribute_dict[direct_superior_userid].append([direct_superior_weibo, reposts_count, comment_count, attitude_count, timestamp, retweeted_uid])
                        except:
                            ds_new_attribute_dict[direct_superior_userid] = [[direct_superior_weibo, reposts_count, comment_count, attitude_count, timestamp, retweeted_uid]]

                print 'after get_superior_userid'     
            
            
            try:
                
                #源头转发网络构建
                
                if status['retweeted_uid'] and status['retweeted_uid'] != 0:
                    repost_uid = status['user']
                    source_uid = status['retweeted_uid']
                    if is_in_trash_list(repost_uid) or is_in_trash_list(source_uid):
                        continue
                    g.add_edge(repost_uid, source_uid) # 将所有topic相关的uid作为node,并将它们按照信息传递方向形成有向图
                    gg.add_edge(repost_uid, source_uid)
                    new_query_dict['$or'].append({'_id':r_mid}) # 为了查询转发微博的内容
                    map_dict[r_mid] = [r_uid, nad_uid, status['timestamp']]
            except (TypeError, KeyError):
                continue
            
    
    
    print 'step_1:g', len(g)
    print 'step_1:ds_dg', len(ds_dg)
    
    # 这里还需要处理,由于根据mid查询xapian获取微博的时间戳,但是有可能会没有获取到。若没有获取到,使用check_attribude进行处理
    if attribute_add == True: # 将retweeted_uid用户的text等取出来,存在new_attribute_dict中
        ruid_count, r_results = statuses_search.search(query=new_query_dict, fields=['_id', 'user', 'timestamp', 'retweeted_mid','retweeted_uid', 'text', 'reposts_count', 'comments_count', 'attitude_count'])
        for rresult in r_results():
            text = rresult['text'] # 这里的text需要再做一次处理----剔除掉’//@..:‘的内容,只获取作者自己的微博文本
            text_spl = text.split('//@')
            try:
                text_add = text_spl[0]
            except:
                text_add = text
            timestamp_add = rresult['timestamp']
            reposts_count_add = rresult['reposts_count']
            comment_count_add = rresult['comments_count']
            attitude_count_add = rresult['attitude_count']
            ruid_add = rresult['retweeted_uid']
            try:
                new_attribute_dict[rresult['user']].append([text_add, reposts_count_add, comment_count_add, attitude_count_add, timestamp_add, ruid_add])
            except:
                new_attribute_dict[rresult['user']] = [[text_add, reposts_count_add, comment_count_add, attitude_count_add, timestamp_add, ruid_add]]
        #print 'map_dict:', map_dict
        new_attribute_dict = check_attribute(new_attribute_dict, new_query_dict, map_dict) # 对query_dict中没有查询到的r_mid,在new_attribute_dict中进行补全处理
        #print 'quer_dict:', ds_new_query_dict
        print 'len(ds_new_attribute_dict):', len(ds_new_attribute_dict)
        if query_dict!={'$or':[]}:
            ds_ruid_count, ds_r_results = statuses_search.search(query=ds_new_query_dict, fields=['_id', 'user', 'timestamp', 'retweeted_mid','retweeted_uid', 'text', 'reposts_count', 'comments_count', 'attitude_count'])
            for ds_rresult in ds_r_results():
                uid = ds_rresult['user']
                timestamp_add = ds_rresult['timestamp']
                text = ds_rresult['text'] # 这里的text需要再做一次处理----剔除掉’//@..:‘的内容,只获取作者自己的微博文本
                text_spl = text.split('//@')
                try:
                    text_add = text_spl[0]
                except:
                    text_add = text
                reposts_count_add = ds_rresult['reposts_count']
                comment_count_add = ds_rresult['comments_count']
                attitude_count_add = ds_rresult['attitude_count']
                ruid_add = rresult['retweeted_uid']
                try:
                    ds_new_attribute_dict[uid].append([text_add, reposts_count_add, comment_count_add, attitude_count_add, timestamp_add, ruid_add])
                except:
                    ds_new_attribute_dict[uid] = [[text_add, reposts_count_add, comment_count_add, attitude_count_add, timestamp_add, ruid_add]]
        
        ds_new_attribute_dict = check_attribute(ds_new_attribute_dict, ds_new_query_dict, ds_map_dict)
    #print 'new_attribute_dict:', new_attribute_dict
    print 'len(g):', len(g)
    print 'len(ds_dg):', len(ds_dg)
    return g , gg, new_attribute_dict, ds_dg, ds_udg, ds_new_attribute_dict
    '''
Пример #14
0
def make_network(topic,
                 date,
                 window_size,
                 topic_xapian_id,
                 max_size=100000,
                 attribute_add=False):
    topics = topic.strip().split(',')
    end_time = int(datetime2ts(date))
    start_time = int(end_time - window2time(window_size))
    print 'start, end:', start_time, end_time
    #topic_id='545f4c22cf198b18c57b8014'
    topic_id = topic_xapian_id
    statuses_search = getXapianWeiboByTopic(topic_id)
    '''
    count, test_results = statuses_search.search(query={'timestamp':{'$gt': start_time, '$lt': end_time}})
    for i in test_results():
        print i
        break
    '''

    g = nx.DiGraph()  # 初始化一个有向图
    gg = nx.Graph()  # 为计算quota初始化一个无向图
    ds_dg = nx.DiGraph()  # direct_superior_network 有向直接上级转发网络
    ds_udg = nx.Graph()  # undirect_superior_network 无向直接上级转发网络
    query_dict = {'timestamp': {'$gt': start_time, '$lt': end_time}}
    #need repost index
    ''' 
    for ctopic in topics:
        query_dict['topics'].append(ctopic)
    '''
    print 'query_dict:', query_dict
    count, get_statuses_results = statuses_search.search(
        query=query_dict,
        fields=[
            '_id', 'user', 'timestamp', 'retweeted_mid', 'retweeted_uid',
            'text', 'reposts_count', 'comments_count', 'attitude_count'
        ],
        max_offset=max_size)
    print 'count_before_nad:', count
    results_list = []
    '''
    根据微博文本进行广告微博筛选
    '''
    if count:
        for weibo in get_statuses_results():
            results_list.append([weibo['_id'], weibo['text']])
        scount, data_wid = ad_classifier(results_list)
    else:
        data_wid = []
        scount = 0
    print 'count_after_nad:', scount

    new_attribute_dict = {
    }  # 星形源头转发网络需要添加的节点对应的text、reposts_count、comment_count、 attitude_count
    ds_new_attribute_dict = {
    }  # 直接上级转发网络需要添加的属性:即节点对应的text、 reposts_count、 comment_count 、attitude_count
    new_query_dict = {'$or': []}  # 用于查询retweeted_mid对应的weibo内容
    ds_new_query_dict = {
        '$or': []
    }  # 用于查询direct_superior_user为retweeted_uid对应的retweeted_mid的微博内容
    map_dict = {
    }  # map_dict = {retweeted_mid:[retweeted_uid, user, timestamp],...} 保存_id timestamp与其对应的retweeted_mid之间的对应关系
    ds_map_dict = {
    }  # ds_dict = {retweeted_mid:[retweeted_uid, user, timestamp]} 直接上级转发网络中直接上级就是源头上级时,对应关系
    get_statuses_results = [
        r for r in get_statuses_results() if r['retweeted_uid'] != 0
    ]
    set_repost_name = set()
    for status in get_statuses_results:
        if str(status['_id']) in data_wid:
            '''
            当微博信息非垃圾时,进行new_attribute_dict的添加----即[a b]->添加a节点的微博信息
            '''
            nad_uid = status['user']
            nad_id = status['_id']
            r_uid = status['retweeted_uid']
            r_mid = status['retweeted_mid']
            if attribute_add == True:
                text_add = status['text']
                reposts_count_add = status['reposts_count']
                comment_count_add = status['comments_count']
                attitude_count_add = status['attitude_count']
                timestamp_add = status['timestamp']
                try:
                    new_attribute_dict[nad_uid].append([
                        text_add, reposts_count_add, comment_count_add,
                        attitude_count_add, timestamp_add, r_uid
                    ])
                    ds_new_attribute_dict[nad_uid].append([
                        text_add, reposts_count_add, comment_count_add,
                        attitude_count_add, timestamp_add, r_uid
                    ])
                except:
                    new_attribute_dict[nad_uid] = [[
                        text_add, reposts_count_add, comment_count_add,
                        attitude_count_add, timestamp_add, r_uid
                    ]]
                    ds_new_attribute_dict[nad_uid] = [[
                        text_add, reposts_count_add, comment_count_add,
                        attitude_count_add, timestamp_add, r_uid
                    ]]
            #print 'len(new_attribute_dict):', len(new_attribute_dict)
            '''
            区别于原创微博
            当是转发微博时,获取直接转发上级----例子:[a b c]->b
            '''
            if status['retweeted_uid'] and status['retweeted_uid'] != 0:
                print 'before get_superior_userid'

                direct_superior_userid = get_superior_userid(
                    status)  # 获取直接转发上级--只获取一跳
                '''
                repost_name = get_superior_userid(status) # test
                set_repost_name.add(repost_name) # test
                '''

                print 'user_id', direct_superior_userid
                if not direct_superior_userid:

                    #当直接转发上级的userid获取不到时,则认为直接转发上级就是源头转发微博

                    direct_superior_userid = r_uid
                    ds_dg.add_edge(nad_uid, direct_superior_userid)
                    ds_udg.add_edge(nad_uid, direct_superior_userid)
                    ds_new_query_dict['$or'].append({'_id': r_mid})
                    ds_map_dict[r_mid] = [r_uid, nad_uid, status['timestamp']]
                    # 当直接上级就是源头上级时,通过查询xapian获取weibo_text timestamp comment_counts repost_counts attitude_counts
                else:

                    #存在直接转发上级

                    ds_dg.add_edge(nad_uid, direct_superior_userid)
                    ds_udg.add_edge(nad_uid, direct_superior_userid)
                    if attribute_add == 'True':
                        weibo_text = status['text']
                        # 获取直接上级微博文本内容
                        weibo_test1 = weibo_text.split(
                            '//@'
                        )  # 此处不会出现文本中没有//@结构,因为这里的直接转发上级是通过昵称转化而得到的,所以一定有//@结构
                        weibo_test2 = weibo_test1[1]
                        m_index = weibo_test2.find(':')
                        direct_superior_weibo = weibo_test2[m_index + 1:]
                        m_all_index = weibo_text.find(':')
                        direct_superior_weibos = weibo_text[m_all_index + 1:]

                        #需要根据文本内容和r_uid获取timestamp

                        direct_superior_info = get_ds_info(
                            direct_superior_weibos, direct_superior_userid,
                            topic, timestamp_add,
                            topic_xapian_id)  # timestamp_add是最终转发微博额时间戳
                        # 通过直接上级微博文本内容和用户id,查询topic对应的Xapian获取该条微博的时间戳等信息
                        # 如果在xapian中查不到这条信息的timestamp,则根据其转发链条关系,以(源头用户ts-重点用户ts)/(链条中人数-1)----源头微博还是需要根据mid查询,还是有可能不在xapian里面
                        # 根据uid和text获取[timestamp, comment_counts, attitude_counts, reposts_counts, r_uid]
                        timestamp = direct_superior_info['timestamp']
                        comment_count = direct_superior_info['comments_count']
                        attitude_count = direct_superior_info['attitude_count']
                        reposts_count = direct_superior_info['reposts_count']
                        retweeted_uid = direct_superior_info['retweeted_uid']
                        try:
                            ds_new_attribute_dict[
                                direct_superior_userid].append([
                                    direct_superior_weibo, reposts_count,
                                    comment_count, attitude_count, timestamp,
                                    retweeted_uid
                                ])
                        except:
                            ds_new_attribute_dict[direct_superior_userid] = [[
                                direct_superior_weibo, reposts_count,
                                comment_count, attitude_count, timestamp,
                                retweeted_uid
                            ]]

                print 'after get_superior_userid'

            try:

                #源头转发网络构建

                if status['retweeted_uid'] and status['retweeted_uid'] != 0:
                    repost_uid = status['user']
                    source_uid = status['retweeted_uid']
                    if is_in_trash_list(repost_uid) or is_in_trash_list(
                            source_uid):
                        continue
                    g.add_edge(
                        repost_uid,
                        source_uid)  # 将所有topic相关的uid作为node,并将它们按照信息传递方向形成有向图
                    gg.add_edge(repost_uid, source_uid)
                    new_query_dict['$or'].append({'_id': r_mid})  # 为了查询转发微博的内容
                    map_dict[r_mid] = [r_uid, nad_uid, status['timestamp']]
            except (TypeError, KeyError):
                continue

    print 'step_1:g', len(g)
    print 'step_1:ds_dg', len(ds_dg)

    # 这里还需要处理,由于根据mid查询xapian获取微博的时间戳,但是有可能会没有获取到。若没有获取到,使用check_attribude进行处理
    if attribute_add == True:  # 将retweeted_uid用户的text等取出来,存在new_attribute_dict中
        ruid_count, r_results = statuses_search.search(
            query=new_query_dict,
            fields=[
                '_id', 'user', 'timestamp', 'retweeted_mid', 'retweeted_uid',
                'text', 'reposts_count', 'comments_count', 'attitude_count'
            ])
        for rresult in r_results():
            text = rresult[
                'text']  # 这里的text需要再做一次处理----剔除掉’//@..:‘的内容,只获取作者自己的微博文本
            text_spl = text.split('//@')
            try:
                text_add = text_spl[0]
            except:
                text_add = text
            timestamp_add = rresult['timestamp']
            reposts_count_add = rresult['reposts_count']
            comment_count_add = rresult['comments_count']
            attitude_count_add = rresult['attitude_count']
            ruid_add = rresult['retweeted_uid']
            try:
                new_attribute_dict[rresult['user']].append([
                    text_add, reposts_count_add, comment_count_add,
                    attitude_count_add, timestamp_add, ruid_add
                ])
            except:
                new_attribute_dict[rresult['user']] = [[
                    text_add, reposts_count_add, comment_count_add,
                    attitude_count_add, timestamp_add, ruid_add
                ]]
        #print 'map_dict:', map_dict
        new_attribute_dict = check_attribute(
            new_attribute_dict, new_query_dict,
            map_dict)  # 对query_dict中没有查询到的r_mid,在new_attribute_dict中进行补全处理
        #print 'quer_dict:', ds_new_query_dict
        print 'len(ds_new_attribute_dict):', len(ds_new_attribute_dict)
        if query_dict != {'$or': []}:
            ds_ruid_count, ds_r_results = statuses_search.search(
                query=ds_new_query_dict,
                fields=[
                    '_id', 'user', 'timestamp', 'retweeted_mid',
                    'retweeted_uid', 'text', 'reposts_count', 'comments_count',
                    'attitude_count'
                ])
            for ds_rresult in ds_r_results():
                uid = ds_rresult['user']
                timestamp_add = ds_rresult['timestamp']
                text = ds_rresult[
                    'text']  # 这里的text需要再做一次处理----剔除掉’//@..:‘的内容,只获取作者自己的微博文本
                text_spl = text.split('//@')
                try:
                    text_add = text_spl[0]
                except:
                    text_add = text
                reposts_count_add = ds_rresult['reposts_count']
                comment_count_add = ds_rresult['comments_count']
                attitude_count_add = ds_rresult['attitude_count']
                ruid_add = rresult['retweeted_uid']
                try:
                    ds_new_attribute_dict[uid].append([
                        text_add, reposts_count_add, comment_count_add,
                        attitude_count_add, timestamp_add, ruid_add
                    ])
                except:
                    ds_new_attribute_dict[uid] = [[
                        text_add, reposts_count_add, comment_count_add,
                        attitude_count_add, timestamp_add, ruid_add
                    ]]

        ds_new_attribute_dict = check_attribute(ds_new_attribute_dict,
                                                ds_new_query_dict, ds_map_dict)
    #print 'new_attribute_dict:', new_attribute_dict
    print 'len(g):', len(g)
    print 'len(ds_dg):', len(ds_dg)
    return g, gg, new_attribute_dict, ds_dg, ds_udg, ds_new_attribute_dict
    '''
Пример #15
0
def get_pushers(topic, new_peaks, new_bottom, ts_list, topic_xapian_id):
    #unit = 900
    #p_during = Hour
    p_ts_list = []
    results = []
    end_ts = ts_list[new_peaks[0]]
    begin_ts = ts_list[new_bottom[0]]
    print 'pusher_start_ts:', ts2date(begin_ts)
    print 'pusher_end_ts:', ts2date(end_ts)
    if begin_ts > end_ts:
        begin_ts = ts_list[0]
    interval = (end_ts - begin_ts) / p_during
    for i in range(interval, 0, -1):
        begin_ts = end_ts - p_during * i
        over_ts = begin_ts + p_during
        p_ts_list.append(over_ts)
        items = db.session.query(PropagateCount).filter(PropagateCount.topic==topic ,\
                                                        PropagateCount.end<=over_ts ,\
                                                        PropagateCount.end>begin_ts ,\
                                                        PropagateCount.range==unit).all()
        if items:
            result = Merge_propagate(items)
        else:
            result = 0
        results.append(float(result))
    #print 'pusher_line:', results
    max_k_timestamp = get_max_k_timestamp(results, p_ts_list)  # 获取增速最快的时间点
    #save max_k_timestamp
    # save_mak_k(max_k_timestamp)
    end = max_k_timestamp
    start = max_k_timestamp - p_during
    xapian_search_weibo = getXapianWeiboByTopic(topic_xapian_id)
    query_dict = {'timestamp': {'$gt': end, '$lt': end + 3600}}
    '''
    count , results = xapian_search_weibo.search(query=query_dict, fields=['_id', 'user','retweeted_uid','retweeted_mid', 'timestamp'])
    ruid_count = {}
    ruid_mid = {}
    for result in results():
        r_uid = result['retweeted_uid']
        if (r_uid == 0) or (not r_uid):
            continue
        ruid_mid[r_uid] = result['retweeted_mid']
        try:
            ruid_count[r_uid] += 1
        except KeyError:
            ruid_count[r_uid] = 1
    sorted_pushers = sorted(ruid_count.items(), key=lambda d:d[1], reverse=True)
    print 'top_trend_pusher_uid:',sorted_pushers
    pusher_list = []
    for pusher in sorted_pushers:
        uid = pusher[0]
        mid = ruid_mid[uid]
        value = pusher[1]
    '''
    #以上是找到斜率最大的时间段内所有转发微博集中地源头用户--但是介于这些用户的相关信息找不到,因而选择使用下面的方法
    #以下是通过找到斜率最大的时间段内所有微博中转发数最大的用户
    count, results = xapian_search_weibo.search(
        query=query_dict,
        sort_by=['reposts_count'],
        fields=['_id', 'user', 'reposts_count'])
    print 'pusher_search_count:', count
    print 'pusher_query_dict:', query_dict
    pusher_list = []
    count = 0
    for result in results():
        count += 1
        if count > 100:
            break
        wid = result['_id']
        uid = result['user']
        value = result['reposts_count']
        pusher_list.append((uid, wid, value))
    # sort by reposts_count
    # sort_by_rc(pusher_list)
    return pusher_list
Пример #16
0
def sort_makers(keyword_data, begin_ts, end_ts, ts_list, topic_xapian_id):
    """Rank users by how many topic keywords their weibo contains.

    Weibos with a timestamp strictly between ``begin_ts - Hour`` and
    ``end_ts`` are searched with ``sort_by=['-timestamp']``; at most
    ``fu_tr_top_keyword`` hits are inspected.  For each hit the terms
    (decoded from UTF-8) that occur in ``keyword_data`` are collected.  Only
    one entry is kept per user: a later hit of the same user replaces the
    earlier one.

    ``ts_list`` is accepted but not used.

    Returns:
        A list of ``(uid, [wid, matched_count, matched_terms])`` pairs sorted
        by ``matched_count`` in descending order, or ``[]`` when the search
        reports no hits.
    """
    window_start = begin_ts - Hour
    query_dict = {'timestamp': {'$gt': window_start, '$lt': end_ts}}
    xapian_search_weibo = getXapianWeiboByTopic(topic_id=topic_xapian_id)
    count, search_weibo = xapian_search_weibo.search(query=query_dict,
                                                     sort_by=['-timestamp'],
                                                     fields=field_list)
    if count == 0:
        return []

    hits_by_uid = {}
    for seen, weibo in enumerate(search_weibo(), 1):
        if seen > fu_tr_top_keyword:
            break
        uid = weibo['user']
        wid = weibo['_id']
        matched_terms = []
        for raw_term in weibo['terms']:
            decoded = raw_term.decode('utf-8')
            if decoded in keyword_data:
                matched_terms.append(decoded)
        hits_by_uid[uid] = [wid, len(matched_terms), matched_terms]

    ranked = sorted(hits_by_uid.items(),
                    key=lambda pair: pair[1][1],
                    reverse=True)
    return ranked[:fu_tr_top_keyword]
Пример #17
0
def get_first_node(topic, start_date, date, windowsize, topic_xapian_id):
    '''
    根据timestamp,获取top20的用户----微博可能就不只20条了
    根据微博获取对应的用户信息------可能会出现用户重复的情况,这里只取时间最早的那一个
    将其保存
    '''
    print 'first_user_topic_id:', topic_xapian_id
    if topic and topic != '':
        datestr = start_date.replace('-', '')
        xapian_search_weibo = getXapianWeiboByTopic(topic_id=topic_xapian_id)
        begin_ts = datetime2ts(start_date)
        end_ts = datetime2ts(date)
        topics = topic.strip().split(',')

        query_dict = {
            'timestamp': {
                '$gte': begin_ts,
                '$lte': end_ts
            },
            '$or': [{
                'message_type': 1
            }, {
                'message_type': 3
            }]
        }

        #query_dict = {'$or':[{'message_type':1}, {'message_type':3}]}
        print 'first_user_query:', query_dict
        # 这里只选取原创和转发微博进行计算
        '''
        for c_topic in topics:
            query_dict['$and'].append({'topics': c_topic})
        '''
        time_top_nodes = xapian_search_weibo.search(query=query_dict,
                                                    sort_by=['-timestamp'],
                                                    fields=fields_list)
        user_list = []
        if not time_top_nodes:
            print 'search error'
        else:
            #print 'time_top_nodes:', time_top_nodes
            s = 0
            '''
            domain_count_list = {'folk':0, 'media':0, 'opinion_leader':0, 'oversea':0, 'other':0}
            domain_user_list = {'folk':[], 'media':[], 'opinion_leader':[], 'oversea':[], 'other':[]}
            '''
            domain_count_list, domain_user_list = init_domain_list()

            print 'start_node:'
            for node in time_top_nodes[1]():
                #print 'node:', node
                uid = node['user']
                user_domain = uid2domain(uid)
                timestamp = node['timestamp']
                user_info = get_user_info(uid)  # 获取top_time微博对应的用户信息
                if s < first_user_count:
                    if user_info and (not (uid in user_list)):
                        s += 1
                        weibo_info = node
                        user_list.append(uid)
                        save_first_nodes(topic, date, windowsize, uid,
                                         timestamp, user_info, weibo_info,
                                         user_domain)
                #if domain_count_list == {'folk':first_user_count, 'media':first_user_count, 'opinion_leader':first_user_count, 'oversea':first_user_count, 'other':first_user_count}:
                #    break
                stop_s = 0
                for domain in domain_list:
                    if domain_count_list[domain] == first_user_count:
                        stop_s += 1
                if stop_s == len(domain_list):
                    break

                for domain in domain_list:
                    if domain_count_list[domain] >= first_user_count:
                        continue
                    elif user_domain == domain:
                        if user_info and (not (uid
                                               in domain_user_list[domain])):
                            domain_user_list[domain].append(uid)
                            domain_count_list[domain] += 1
                            rank = domain_count_list[domain]
                            save_domain_nodes(topic, date, windowsize, uid,
                                              timestamp, user_info, weibo_info,
                                              user_domain, rank)
Пример #18
0
def get_pushers(topic, new_peaks, new_bottom, ts_list, topic_xapian_id):
    #unit = 900
    #p_during = Hour
    p_ts_list = []
    results = []
    end_ts = ts_list[new_peaks[0]]
    begin_ts = ts_list[new_bottom[0]]
    print 'pusher_start_ts:', ts2date(begin_ts)
    print 'pusher_end_ts:', ts2date(end_ts)
    if begin_ts>end_ts:
        begin_ts = ts_list[0]
    interval = (end_ts - begin_ts) / p_during
    for i in range(interval, 0, -1):
        begin_ts = end_ts - p_during * i
        over_ts = begin_ts + p_during
        p_ts_list.append(over_ts)
        items = db.session.query(PropagateCount).filter(PropagateCount.topic==topic ,\
                                                        PropagateCount.end<=over_ts ,\
                                                        PropagateCount.end>begin_ts ,\
                                                        PropagateCount.range==unit).all()
        if items:
            result = Merge_propagate(items)
        else:
            result = 0
        results.append(float(result))
    #print 'pusher_line:', results
    max_k_timestamp = get_max_k_timestamp(results, p_ts_list) # 获取增速最快的时间点
    #save max_k_timestamp
    # save_mak_k(max_k_timestamp)
    end = max_k_timestamp
    start = max_k_timestamp - p_during
    xapian_search_weibo = getXapianWeiboByTopic(topic_xapian_id)
    query_dict = {
        'timestamp':{'$gt':end, '$lt':end+3600}
        }
    '''
    count , results = xapian_search_weibo.search(query=query_dict, fields=['_id', 'user','retweeted_uid','retweeted_mid', 'timestamp'])
    ruid_count = {}
    ruid_mid = {}
    for result in results():
        r_uid = result['retweeted_uid']
        if (r_uid == 0) or (not r_uid):
            continue
        ruid_mid[r_uid] = result['retweeted_mid']
        try:
            ruid_count[r_uid] += 1
        except KeyError:
            ruid_count[r_uid] = 1
    sorted_pushers = sorted(ruid_count.items(), key=lambda d:d[1], reverse=True)
    print 'top_trend_pusher_uid:',sorted_pushers
    pusher_list = []
    for pusher in sorted_pushers:
        uid = pusher[0]
        mid = ruid_mid[uid]
        value = pusher[1]
    '''
    #以上是找到斜率最大的时间段内所有转发微博集中地源头用户--但是介于这些用户的相关信息找不到,因而选择使用下面的方法
    #以下是通过找到斜率最大的时间段内所有微博中转发数最大的用户
    count ,results = xapian_search_weibo.search(query=query_dict, sort_by=['reposts_count'], fields=['_id', 'user', 'reposts_count'])
    print 'pusher_search_count:', count
    print 'pusher_query_dict:', query_dict
    pusher_list = []
    count = 0
    for result in results():
        count += 1
        if count>100:
            break
        wid = result['_id']
        uid = result['user']
        value = result['reposts_count']
        pusher_list.append((uid, wid, value))
    # sort by reposts_count
    # sort_by_rc(pusher_list)
    return pusher_list
Пример #19
0
from case.identify import utils as identifyModule
import search as searchModule
from case.time_utils import ts2datetime, ts2date
from xapian_case.xapian_backend import XapianSearch
from xapian_case.utils import cut, load_scws
from case.dynamic_xapian_weibo import getXapianWeiboByTopic
from case.global_config import XAPIAN_USER_DATA_PATH
from case.Database import Event, EventManager
from case.topic_manage import topics_name_start_end
from flask import Blueprint, url_for, render_template, request, abort, flash, session, redirect, make_response

# Module-level singletons, created once at import time.

# Handle returned by load_scws(); presumably the SCWS word segmenter used
# together with `cut` -- verify against xapian_case.utils.
scws = load_scws()

# Flask blueprint named 'case'; all of its routes live under '/index'.
mod = Blueprint('case', __name__, url_prefix='/index')

# Weibo search handle opened without a topic id.
# NOTE(review): which index the no-argument call selects is decided inside
# getXapianWeiboByTopic -- confirm its default.
xapian_search_weibo = getXapianWeiboByTopic()

# Shared EventManager instance.
em = EventManager()


def acquire_user_by_id(uid):
    """Look up a user in the ``master_timeline_user`` Xapian index.

    ``uid`` is converted with ``int()`` before the lookup, so numeric
    strings are accepted.  The fields requested are ``name``, ``location``,
    ``followers_count``, ``friends_count`` and ``profile_image_url``.

    NOTE(review): the visible definition stops right after ``user`` is
    initialised, so how ``result`` is copied into ``user`` and what is
    returned cannot be told from this excerpt.
    """
    # A new searcher over the user index is opened on every call.
    user_search = XapianSearch(path=XAPIAN_USER_DATA_PATH,
                               name='master_timeline_user',
                               schema_version=1)
    result = user_search.search_by_id(int(uid),
                                      fields=[
                                          'name', 'location',
                                          'followers_count', 'friends_count',
                                          'profile_image_url'
                                      ])
    user = {}
Пример #20
0
            location_dict['mid'] = r['_id']
            location_dict['topic'] = topic
            location_dict['ts'] = r['timestamp']
            location_dict['origin_location'] = origin_location.split('\t')[1]
            location_dict['repost_location'] = None
            return location_dict

    return None


# Script entry point: run the repost search for one hard-coded topic and
# date range.
if __name__ == '__main__':
    # Time range of the run, converted by datetime2ts.
    START_TS = datetime2ts('2015-03-02')
    END_TS = datetime2ts('2015-03-15')

    topic = u'两会2015'
    # Resolve the topic name to the id used to open its Xapian index.
    topic_id = getTopicByName(topic)['_id']
    print 'topic: ', topic.encode('utf8')
    print topic_id, START_TS, END_TS

    # Bound at module level and not passed to repost_search below;
    # presumably repost_search reads it as a global -- verify.
    xapian_search = getXapianWeiboByTopic(topic_id)
    repost_search(topic, START_TS, END_TS)
    # The string below is disabled code that deleted the existing
    # CityRepost rows of the topic.
    """
    item_exist = db.session.query(CityRepost).filter(CityRepost.topic == topic).all()

    if item_exist:
        for item in item_exist:
            db.session.delete(item)
    db.session.commit()
    print 'commited'
    """