Example #1
def make_network(topic, date, window_size, max_size=100000):
    end_time = datetime2ts(date)
    start_time = end_time - window2time(window_size)

    statuses_search = getXapianweiboByTs(start_time, end_time)

    g = nx.DiGraph()

    #need repost index
    query_dict = {'text': topic, 'timestamp': {'$gt': start_time, '$lt': end_time}}

    count, get_statuses_results = statuses_search.search(query=query_dict, field=['user', 'retweeted_uid'], max_offset=max_size)
    print 'topic statuses count %s' % count

    for status in get_statuses_results():
        try:
            if status['retweeted_uid'] and status['retweeted_uid'] != 0:
                repost_uid = status['user']
                source_uid = status['retweeted_uid']
                if is_in_trash_list(repost_uid) or is_in_trash_list(source_uid):
                    continue
                g.add_edge(repost_uid, source_uid)
        except (TypeError, KeyError):
            continue
    return g
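A short usage sketch (hedged: it assumes the helpers above — datetime2ts, window2time, getXapianweiboByTs — are importable and that a repost-indexed window exists for the date). Edges run repost_uid -> source_uid, so in-degree counts how often a user was reposted:

g = make_network(u'some topic', '2013-09-01', 1)
print 'nodes: %s, edges: %s' % (g.number_of_nodes(), g.number_of_edges())
# rank sources by how often they were reposted (networkx 1.x dict API)
top_sources = sorted(g.in_degree().iteritems(), key=lambda x: x[1], reverse=True)[:10]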
Example #2
def make_network(topic, date, window_size, max_size=100000, ts=False):
    end_time = datetime2ts(date)
    start_time = end_time - window2time(window_size)

    g = nx.DiGraph()

    #need repost index
    topic = cut(s, topic.encode('utf-8'))  # 's' is assumed to be a module-level segmenter handle used by cut()
    statuses_search = XapianSearch(path='/opt/xapian_weibo/data/', name='master_timeline_weibo', schema_version=2)
    query_dict = {'text': topic, 'timestamp': {'$gt': start_time, '$lt': end_time}}

    if ts:
        count, get_statuses_results = statuses_search.search(query=query_dict, field=['text', 'user', 'timestamp', 'retweeted_status'], max_offset=max_size)
    else:
        count, get_statuses_results = statuses_search.search(query=query_dict, field=['text', 'user', 'retweeted_status'], max_offset=max_size)
    print 'topic statuses count %s' % count

    if ts:
        uid_ts = {}
        for status in get_statuses_results():
            try:
                if status['retweeted_status']:
                    repost_uid = status['user']
                    rt_mid = status['retweeted_status']
                    repost_ts = int(status['timestamp'])
                    source_status = acquire_status_by_id(rt_mid)
                    source_uid = source_status['user']
                    source_ts = int(source_status['timestamp'])
                    if is_in_trash_list(repost_uid) or is_in_trash_list(source_uid):
                        continue
                    # keep each user's earliest observed timestamp
                    if repost_uid not in uid_ts or uid_ts[repost_uid] > repost_ts:
                        uid_ts[repost_uid] = repost_ts
                    if source_uid not in uid_ts or uid_ts[source_uid] > source_ts:
                        uid_ts[source_uid] = source_ts
                    g.add_edge(repost_uid, source_uid)
            except (TypeError, KeyError):
                continue
        return uid_ts, g
    else:
        for status in get_statuses_results():
            try:
                if status['retweeted_status']:
                    repost_uid = status['user']
                    rt_mid = status['retweeted_status']
                    source_uid = acquire_status_by_id(rt_mid)['user']
                    if is_in_trash_list(repost_uid) or is_in_trash_list(source_uid):
                        continue
                    g.add_edge(repost_uid, source_uid)
            except (TypeError, KeyError):
                continue
        return g
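With ts=True the function additionally returns uid_ts, each user's earliest observed timestamp, which is enough to replay the cascade in order. A minimal sketch under the same assumptions as above:

uid_ts, g = make_network(u'some topic', '2013-09-01', 1, ts=True)
for uid in sorted(uid_ts, key=uid_ts.get):  # chronological order of first appearance
    print uid, uid_ts[uid]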
Example #3
def important_rank(top_n, date, window_size):
    date_time = datetime2ts(date)
    uid_important = {}
    if window_size == 1:
        db_name = get_leveldb('important', date_time)
        daily_user_important_bucket = leveldb.LevelDB(os.path.join(LEVELDBPATH, db_name),
                                              block_cache_size=8 * (2 << 25), write_buffer_size=8 * (2 << 25))

        count = 0
        for uid, important in daily_user_important_bucket.RangeIter():
            count = count + 1
            print count
            uid = int(uid)
            important = float(important)
            uid_important[uid] = important
    else:
        for i in range(window_size):
            db_name = get_leveldb('important', date_time - i*24*60*60)
            daily_user_important_bucket = leveldb.LevelDB(os.path.join(LEVELDBPATH, db_name),
                                              block_cache_size=8 * (2 << 25), write_buffer_size=8 * (2 << 25))
            for uid, important in daily_user_important_bucket.RangeIter():
                uid = int(uid)
                important = float(important)
                if uid not in uid_important:
                    uid_important[uid] = 0
                uid_important[uid] += important

    if len(uid_important) < 100000000:
        sorted_uid_important = sorted(uid_important.iteritems(), key=operator.itemgetter(1), reverse=True)
        sorted_uids = []
        count = 0
        for uid, value in sorted_uid_important:
            if is_in_trash_list(uid):
                continue
            if count >= top_n:
                break
            sorted_uids.append(uid)
            count += 1  # increment inside the loop, otherwise top_n is never honored
    else:
        sorted_uids = user_rank(uid_important, 'whole_active', top_n, date, window_size)


    return sorted_uids
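The window branch is a plain per-day accumulation: each day's 'important' LevelDB bucket is summed into uid_important. The same pattern reads more directly with collections.defaultdict; a hedged sketch, where open_daily_bucket is a hypothetical helper wrapping the leveldb.LevelDB call above:

from collections import defaultdict

uid_important = defaultdict(float)
for i in range(window_size):
    bucket = open_daily_bucket('important', date_time - i * 24 * 3600)  # hypothetical helper
    for uid, important in bucket.RangeIter():
        uid_important[int(uid)] += float(important)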
Example #4
def make_network(topic,
                 date,
                 window_size,
                 topic_xapian_id,
                 max_size=100000,
                 attribute_add=False):
    topics = topic.strip().split(',')
    end_time = int(datetime2ts(date))
    start_time = int(end_time - window2time(window_size))
    print 'start, end:', start_time, end_time
    #topic_id='545f4c22cf198b18c57b8014'
    topic_id = topic_xapian_id
    statuses_search = getXapianWeiboByTopic(topic_id)
    '''
    count, test_results = statuses_search.search(query={'timestamp':{'$gt': start_time, '$lt': end_time}})
    for i in test_results():
        print i
        break
    '''

    g = nx.DiGraph()  # directed source-repost network
    gg = nx.Graph()  # undirected copy, used for quota computation
    ds_dg = nx.DiGraph()  # direct_superior_network: directed direct-superior repost network
    ds_udg = nx.Graph()  # undirect_superior_network: undirected direct-superior repost network
    query_dict = {'timestamp': {'$gt': start_time, '$lt': end_time}}
    #need repost index
    ''' 
    for ctopic in topics:
        query_dict['topics'].append(ctopic)
    '''
    print 'query_dict:', query_dict
    count, get_statuses_results = statuses_search.search(
        query=query_dict,
        fields=[
            '_id', 'user', 'timestamp', 'retweeted_mid', 'retweeted_uid',
            'text', 'reposts_count', 'comments_count', 'attitude_count'
        ],
        max_offset=max_size)
    print 'count_before_nad:', count
    results_list = []
    '''
    filter advertisement weibos based on their text
    '''
    if count:
        for weibo in get_statuses_results():
            results_list.append([weibo['_id'], weibo['text']])
        scount, data_wid = ad_classifier(results_list)
    else:
        data_wid = []
        scount = 0
    print 'count_after_nad:', scount

    new_attribute_dict = {}  # per-node text, reposts_count, comment_count and attitude_count to attach to the star-shaped source repost network
    ds_new_attribute_dict = {}  # attributes to attach to the direct-superior repost network: per-node text, reposts_count, comment_count and attitude_count
    new_query_dict = {'$or': []}  # used to look up the weibo content behind each retweeted_mid
    ds_new_query_dict = {'$or': []}  # used to look up the retweeted_mid weibo content whose direct_superior_user is the retweeted_uid
    map_dict = {}  # {retweeted_mid: [retweeted_uid, user, timestamp], ...} maps each _id/timestamp to its retweeted_mid
    ds_map_dict = {}  # {retweeted_mid: [retweeted_uid, user, timestamp]} same mapping when the direct superior is the source superior
    get_statuses_results = [
        r for r in get_statuses_results() if r['retweeted_uid'] != 0
    ]
    set_repost_name = set()
    for status in get_statuses_results:
        if str(status['_id']) in data_wid:
            '''
            When the weibo is not spam, add node a's weibo info to new_attribute_dict, i.e. for an edge [a b] attach a's info
            '''
            nad_uid = status['user']
            nad_id = status['_id']
            r_uid = status['retweeted_uid']
            r_mid = status['retweeted_mid']
            if attribute_add == True:
                text_add = status['text']
                reposts_count_add = status['reposts_count']
                comment_count_add = status['comments_count']
                attitude_count_add = status['attitude_count']
                timestamp_add = status['timestamp']
                try:
                    new_attribute_dict[nad_uid].append([
                        text_add, reposts_count_add, comment_count_add,
                        attitude_count_add, timestamp_add, r_uid
                    ])
                    ds_new_attribute_dict[nad_uid].append([
                        text_add, reposts_count_add, comment_count_add,
                        attitude_count_add, timestamp_add, r_uid
                    ])
                except KeyError:
                    new_attribute_dict[nad_uid] = [[
                        text_add, reposts_count_add, comment_count_add,
                        attitude_count_add, timestamp_add, r_uid
                    ]]
                    ds_new_attribute_dict[nad_uid] = [[
                        text_add, reposts_count_add, comment_count_add,
                        attitude_count_add, timestamp_add, r_uid
                    ]]
            #print 'len(new_attribute_dict):', len(new_attribute_dict)
            '''
            Unlike an original weibo: when this is a repost, resolve the direct repost superior, e.g. for chain [a b c] -> b
            '''
            if status['retweeted_uid'] and status['retweeted_uid'] != 0:
                print 'before get_superior_userid'

                direct_superior_userid = get_superior_userid(
                    status)  # get the direct repost superior (one hop only)
                '''
                repost_name = get_superior_userid(status) # test
                set_repost_name.add(repost_name) # test
                '''

                print 'user_id', direct_superior_userid
                if not direct_superior_userid:

                    # if the direct superior's userid cannot be resolved, treat the source weibo's author as the direct superior

                    direct_superior_userid = r_uid
                    ds_dg.add_edge(nad_uid, direct_superior_userid)
                    ds_udg.add_edge(nad_uid, direct_superior_userid)
                    ds_new_query_dict['$or'].append({'_id': r_mid})
                    ds_map_dict[r_mid] = [r_uid, nad_uid, status['timestamp']]
                    # when the direct superior is the source superior, query xapian for weibo_text, timestamp, comment_counts, repost_counts, attitude_counts
                else:

                    # a direct repost superior exists

                    ds_dg.add_edge(nad_uid, direct_superior_userid)
                    ds_udg.add_edge(nad_uid, direct_superior_userid)
                    if attribute_add == True:
                        weibo_text = status['text']
                        # extract the direct superior's weibo text
                        weibo_test1 = weibo_text.split(
                            '//@'
                        )  # a '//@' segment is guaranteed: the direct superior was resolved from a nickname, which only occurs inside a '//@' structure
                        weibo_test2 = weibo_test1[1]
                        m_index = weibo_test2.find(':')
                        direct_superior_weibo = weibo_test2[m_index + 1:]
                        m_all_index = weibo_text.find(':')
                        direct_superior_weibos = weibo_text[m_all_index + 1:]

                        # the timestamp has to be fetched from the text content and r_uid

                        direct_superior_info = get_ds_info(
                            direct_superior_weibos, direct_superior_userid,
                            topic, timestamp_add,
                            topic_xapian_id)  # timestamp_add is the timestamp of the final repost
                        # query the topic's Xapian index with the direct superior's text and uid to get that weibo's timestamp and related fields
                        # if the timestamp is missing from xapian, estimate it along the repost chain as (source user ts - key user ts) / (people in chain - 1); the source weibo itself is looked up by mid and may also be absent
                        # from uid and text obtain [timestamp, comment_counts, attitude_counts, reposts_counts, r_uid]
                        timestamp = direct_superior_info['timestamp']
                        comment_count = direct_superior_info['comments_count']
                        attitude_count = direct_superior_info['attitude_count']
                        reposts_count = direct_superior_info['reposts_count']
                        retweeted_uid = direct_superior_info['retweeted_uid']
                        try:
                            ds_new_attribute_dict[
                                direct_superior_userid].append([
                                    direct_superior_weibo, reposts_count,
                                    comment_count, attitude_count, timestamp,
                                    retweeted_uid
                                ])
                        except KeyError:
                            ds_new_attribute_dict[direct_superior_userid] = [[
                                direct_superior_weibo, reposts_count,
                                comment_count, attitude_count, timestamp,
                                retweeted_uid
                            ]]

                print 'after get_superior_userid'

            try:

                #source repost network construction

                if status['retweeted_uid'] and status['retweeted_uid'] != 0:
                    repost_uid = status['user']
                    source_uid = status['retweeted_uid']
                    if is_in_trash_list(repost_uid) or is_in_trash_list(
                            source_uid):
                        continue
                    g.add_edge(
                        repost_uid,
                        source_uid)  # add every topic-related uid as a node, with edges following the direction of information flow
                    gg.add_edge(repost_uid, source_uid)
                    new_query_dict['$or'].append({'_id': r_mid})  # to look up the reposted weibo's content
                    map_dict[r_mid] = [r_uid, nad_uid, status['timestamp']]
            except (TypeError, KeyError):
                continue

    print 'step_1:g', len(g)
    print 'step_1:ds_dg', len(ds_dg)

    # still to handle: the timestamp is fetched from xapian by mid but may be missing; if so, fall back to check_attribute
    if attribute_add == True:  # pull the retweeted_uid users' text etc. into new_attribute_dict
        ruid_count, r_results = statuses_search.search(
            query=new_query_dict,
            fields=[
                '_id', 'user', 'timestamp', 'retweeted_mid', 'retweeted_uid',
                'text', 'reposts_count', 'comments_count', 'attitude_count'
            ])
        for rresult in r_results():
            text = rresult[
                'text']  # strip the '//@..:' parts so only the author's own text remains
            text_spl = text.split('//@')
            text_add = text_spl[0]
            timestamp_add = rresult['timestamp']
            reposts_count_add = rresult['reposts_count']
            comment_count_add = rresult['comments_count']
            attitude_count_add = rresult['attitude_count']
            ruid_add = rresult['retweeted_uid']
            try:
                new_attribute_dict[rresult['user']].append([
                    text_add, reposts_count_add, comment_count_add,
                    attitude_count_add, timestamp_add, ruid_add
                ])
            except KeyError:
                new_attribute_dict[rresult['user']] = [[
                    text_add, reposts_count_add, comment_count_add,
                    attitude_count_add, timestamp_add, ruid_add
                ]]
        #print 'map_dict:', map_dict
        new_attribute_dict = check_attribute(
            new_attribute_dict, new_query_dict,
            map_dict)  # fill in new_attribute_dict for any r_mid the query did not return
        #print 'quer_dict:', ds_new_query_dict
        print 'len(ds_new_attribute_dict):', len(ds_new_attribute_dict)
        if ds_new_query_dict != {'$or': []}:  # only query when some direct-superior mids were collected
            ds_ruid_count, ds_r_results = statuses_search.search(
                query=ds_new_query_dict,
                fields=[
                    '_id', 'user', 'timestamp', 'retweeted_mid',
                    'retweeted_uid', 'text', 'reposts_count', 'comments_count',
                    'attitude_count'
                ])
            for ds_rresult in ds_r_results():
                uid = ds_rresult['user']
                timestamp_add = ds_rresult['timestamp']
                text = ds_rresult[
                    'text']  # strip the '//@..:' parts so only the author's own text remains
                text_spl = text.split('//@')
                text_add = text_spl[0]
                reposts_count_add = ds_rresult['reposts_count']
                comment_count_add = ds_rresult['comments_count']
                attitude_count_add = ds_rresult['attitude_count']
                ruid_add = ds_rresult['retweeted_uid']
                try:
                    ds_new_attribute_dict[uid].append([
                        text_add, reposts_count_add, comment_count_add,
                        attitude_count_add, timestamp_add, ruid_add
                    ])
                except KeyError:
                    ds_new_attribute_dict[uid] = [[
                        text_add, reposts_count_add, comment_count_add,
                        attitude_count_add, timestamp_add, ruid_add
                    ]]

        ds_new_attribute_dict = check_attribute(ds_new_attribute_dict,
                                                ds_new_query_dict, ds_map_dict)
    #print 'new_attribute_dict:', new_attribute_dict
    print 'len(g):', len(g)
    print 'len(ds_dg):', len(ds_dg)
    return g, gg, new_attribute_dict, ds_dg, ds_udg, ds_new_attribute_dict
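Example #4 builds two networks at once: g/gg link each reposter straight to the source author (retweeted_uid), while ds_dg/ds_udg link each reposter to the one-hop direct superior recovered from the '//@nick: text' chains in the weibo body (via get_superior_userid). A hedged, self-contained sketch of that one-hop extraction, independent of the project's helper:

# -*- coding: utf-8 -*-
import re

def first_superior_nick(weibo_text):
    """Return the nickname of the direct (one-hop) repost superior,
    i.e. the first '//@nick:' segment, or None for original weibos."""
    m = re.search(ur'//@([^:/：]+)[:：]', weibo_text)
    return m.group(1) if m else None

print first_superior_nick(u'comment //@Alice: nice //@Bob: original text')  # Alice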
Example #5
def make_network(topic,
                 date,
                 window_size,
                 max_size=100000,
                 attribute_add=False):
    topics = topic.strip().split(',')
    end_time = int(datetime2ts(date))
    start_time = int(end_time - window2time(window_size))
    print 'start, end:', start_time, end_time
    query_body = {
        'query': {
            'bool': {
                'should': [
                    #{'term': {
                    #'message_type': 1
                    #}
                    #},
                    {
                        'term': {
                            'message_type': 3
                        }
                    }
                ],
                'must':
                # {'term':{'name': topic}},
                {
                    'range': {
                        'timestamp': {
                            'gte': start_time,
                            'lt': end_time
                        }
                    }
                }
            }
        },
        'size': max_size,  # limit on the number of returned hits; to be removed
        #'sort': {"timestamp": {"order": "asc"}}
    }
    es_search_weibos = weibo_es.search(index=topic,
                                       doc_type=weibo_index_type,
                                       body=query_body)['hits']['hits']
    get_statuses_results = es_search_weibos
    g = nx.DiGraph()  # directed source-repost network
    gg = nx.Graph()  # undirected copy, used for quota computation
    results_list = []
    '''
    filter advertisement weibos based on their text
    '''
    if len(es_search_weibos) > 1:
        for weibo in get_statuses_results:
            results_list.append(
                [weibo['_source']['mid'], weibo['_source']['text']])
        scount, data_wid = ad_classifier(results_list)
        #print data_wid
    else:
        data_wid = []
        scount = 0
    print 'count_after_nad:', scount

    new_attribute_dict = {}  # per-node text, reposts_count, comment_count and attitude_count to attach to the star-shaped source repost network
    map_dict = {}  # {retweeted_mid: [retweeted_uid, user, timestamp], ...} maps each _id/timestamp to its retweeted_mid
    ds_map_dict = {}  # {retweeted_mid: [retweeted_uid, user, timestamp]} mapping when the direct superior is the source superior
    get_statuses_results = [
        r for r in get_statuses_results if r['_source']['uid'] != 0
    ]
    #print get_statuses_results
    print len(get_statuses_results)
    set_repost_name = set()
    for status in get_statuses_results:
        if str(status['_source']['mid']) in data_wid:
            #print status['_source']
            '''
            When the weibo is not spam, add node a's weibo info to new_attribute_dict, i.e. for an edge [a b] attach a's info
            '''
            nad_uid = status['_source']['uid']
            nad_id = status['_source']['mid']
            #r_uid = status['_source']['root_uid']
            #r_mid = status['_source']['root_mid']

            try:
                r_uid = status['_source']['root_uid']
                r_mid = status['_source']['root_mid']
            except KeyError:
                r_uid = 0
                r_mid = 0

            #print 'hahahahahahahahaha'
            if attribute_add == True:
                text_add = status['_source']['text']
                try:
                    reposts_count_add = status['_source']['retweeted']
                except KeyError:
                    reposts_count_add = 0
                try:
                    comment_count_add = status['_source']['comment']
                except KeyError:
                    comment_count_add = 0
                #attitude_count_add = status['_source']['attitude_count']
                timestamp_add = status['_source']['timestamp']
                try:
                    new_attribute_dict[nad_uid].append([
                        text_add, reposts_count_add, comment_count_add,
                        timestamp_add, r_uid
                    ])
                    #ds_new_attribute_dict[nad_uid].append([text_add, reposts_count_add, comment_count_add, timestamp_add, r_uid])
                except KeyError:
                    new_attribute_dict[nad_uid] = [[
                        text_add, reposts_count_add, comment_count_add,
                        timestamp_add, r_uid
                    ]]
                    #ds_new_attribute_dict[nad_uid] = [[text_add, reposts_count_add, comment_count_add, timestamp_add, r_uid]]

            try:

                #source repost network construction
                if status['_source'][
                        'root_uid'] and status['_source']['root_uid'] != 0:
                    repost_uid = status['_source']['uid']
                    source_uid = status['_source']['root_uid']
                    # print '405',repost_uid,source_uid
                    if is_in_trash_list(repost_uid) or is_in_trash_list(
                            source_uid):
                        continue
                    g.add_edge(
                        repost_uid,
                        source_uid)  # add every topic-related uid as a node, with edges following the direction of information flow
                    gg.add_edge(repost_uid, source_uid)
                    #new_query_dict['$or'].append({'_id':r_mid}) # to look up the reposted weibo's content
                    map_dict[r_mid] = [r_uid, nad_uid, status['_source']['timestamp']]
            except (TypeError, KeyError):
                continue
    print 'step_1:g', len(g)
    return g, gg, new_attribute_dict
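Example #5 ports the same construction to Elasticsearch: hits arrive under ['hits']['hits'] and every field sits in _source, which is why the timestamp lookup goes through status['_source']. The commented-out sort clause shows how to make the traversal chronological; a hedged variant of the query with it enabled (same weibo_es client and weibo_index_type as above):

query_body = {
    'query': {
        'bool': {
            'should': [{'term': {'message_type': 3}}],  # reposts only
            'must': {'range': {'timestamp': {'gte': start_time, 'lt': end_time}}}
        }
    },
    'size': max_size,
    'sort': {'timestamp': {'order': 'asc'}}  # oldest first
}
hits = weibo_es.search(index=topic, doc_type=weibo_index_type, body=query_body)['hits']['hits']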
Example #6
def realtime_burst_user(top_n, current_time):
    current_datetime = datetime.fromtimestamp(current_time)
    current_hour = current_datetime.hour
    total_size = 0
    total_uid_count = {}
    for h in range(current_hour):
        db_name = get_leveldb(current_time, h)
        uid_group_bucket = leveldb.LevelDB(os.path.join(LEVELDBPATH, db_name),
                                              block_cache_size=8 * (2 << 25), write_buffer_size=8 * (2 << 25))
        try:
            group_size = uid_group_bucket.Get('size')
        except KeyError:
            continue
        if group_size:
            group_size = int(group_size)
            if group_size > 0:
                total_size += group_size
        for uid, value in uid_group_bucket.RangeIter():
            if uid == 'size':
                continue
            uid = int(uid)
            value = float(value)
            if uid not in total_uid_count:
                total_uid_count[uid] = 0
            total_uid_count[uid] += value 

    uid_burst = {}
    for h in range(current_hour):
        db_name = get_leveldb(current_time, h)
        uid_group_bucket = leveldb.LevelDB(os.path.join(LEVELDBPATH, db_name),
                                              block_cache_size=8 * (2 << 25), write_buffer_size=8 * (2 << 25))
        try:
            group_size = uid_group_bucket.Get('size')
        except KeyError:
            continue
        if group_size:
            group_size = int(group_size)
            if group_size <= 0:
                continue
        for uid, value in uid_group_bucket.RangeIter():
            if uid == 'size':
                continue
            uid = int(uid)
            value = float(value)
            # 2x2 contingency table for this uid and hour:
            # A = uid's weight this hour, B = uid's weight in the other hours,
            # C = everyone else this hour, D = everyone else in the other hours
            A = value
            B = total_uid_count[uid] - A
            C = group_size - A
            D = total_size - total_uid_count[uid] - C
            denom = (A + B) * (C + D) * (A + C) * (B + D)
            if denom == 0:
                continue
            if uid not in uid_burst:
                uid_burst[uid] = 0
            uid_burst[uid] += (A + B + C + D) * ((A * D - B * C) ** 2) * 1.0 / denom
    
    sorted_uid_burst = sorted(uid_burst.iteritems(), key=operator.itemgetter(1), reverse=True)
    sorted_uids = []
    count = 0
    for uid, value in sorted_uid_burst:
        if is_in_trash_list(uid):
            continue
        if count >= top_n:
            break
        sorted_uids.append(uid)
        count += 1

    data = generate_rank_results(sorted_uids)

    return data
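The per-hour increment is the chi-square statistic of a 2x2 contingency table: A is the user's weight in that hour, B the same user's weight in all other hours, C the rest of the crowd in that hour, D the rest elsewhere. Summing chi-square over the hours rewards users whose activity is concentrated in a few bursts. A self-contained sketch of the per-hour term:

def chi_square_2x2(A, B, C, D):
    # chi-square for the 2x2 contingency table [[A, B], [C, D]]
    denom = (A + B) * (C + D) * (A + C) * (B + D)
    if denom == 0:
        return 0.0
    return (A + B + C + D) * (A * D - B * C) ** 2 * 1.0 / denom

# toy check: 9 of this hour's 10 posts come from one user who has 12 posts total
print chi_square_2x2(9, 3, 1, 87)  # ~64 -> strongly bursty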