Exemplo n.º 1
0
def _record_earliest(uid_ts, uid, stamp):
    """Store ``stamp`` for ``uid`` unless an earlier (smaller) one is already recorded."""
    if uid not in uid_ts or uid_ts[uid] > stamp:
        uid_ts[uid] = stamp


def make_network(topic, date, window_size, max_size=100000, ts=False):
    """Build a directed repost graph for the statuses matching ``topic``.

    Statuses are searched in the open time window
    ``(datetime2ts(date) - window2time(window_size), datetime2ts(date))``.
    For every returned status with a truthy ``retweeted_status`` field, the
    referenced status is loaded via ``acquire_status_by_id`` and an edge
    ``reposting user -> source user`` is added, unless either user is
    reported by ``is_in_trash_list``.

    Args:
        topic: topic text; it is UTF-8 encoded and passed through ``cut``
            before being used as the ``text`` query.
        date: end of the window, converted with ``datetime2ts``.
        window_size: window length, converted with ``window2time``.
        max_size: forwarded to the search as ``max_offset``.
        ts: when true, also request ``timestamp`` and track, per user, the
            smallest timestamp seen among that user's reposts / source
            statuses.

    Returns:
        ``(uid_ts, g)`` when ``ts`` is true, where ``uid_ts`` maps user id to
        the smallest timestamp seen; otherwise just the graph ``g``.
    """
    end_time = datetime2ts(date)
    start_time = end_time - window2time(window_size)

    g = nx.DiGraph()

    # need repost index
    topic = cut(s, topic.encode('utf-8'))
    query_dict = {'text': topic, 'timestamp': {'$gt': start_time, '$lt': end_time}}

    # Timestamps are only fetched when the caller asked for them.
    if ts:
        wanted_fields = ['text', 'user', 'timestamp', 'retweeted_status']
    else:
        wanted_fields = ['text', 'user', 'retweeted_status']
    # NOTE(review): the keyword here is `field=`, while auto() in this file
    # calls search with `fields=` -- confirm which name the search API expects.
    count, get_statuses_results = statuses_search.search(query=query_dict, field=wanted_fields, max_offset=max_size)
    # Parenthesised single-argument form prints identically on Python 2 and
    # also parses on Python 3.
    print('topic statuses count %s' % count)

    uid_ts = {}
    for status in get_statuses_results():
        # Any status with missing/malformed fields (or whose source status
        # cannot be used) is skipped rather than aborting the whole build.
        try:
            if not status['retweeted_status']:
                continue
            repost_uid = status['user']
            rt_mid = status['retweeted_status']
            if ts:
                repost_ts = int(status['timestamp'])
            source_status = acquire_status_by_id(rt_mid)
            source_uid = source_status['user']
            if ts:
                source_ts = int(source_status['timestamp'])
            if is_in_trash_list(repost_uid) or is_in_trash_list(source_uid):
                continue
            if ts:
                _record_earliest(uid_ts, repost_uid, repost_ts)
                _record_earliest(uid_ts, source_uid, source_ts)
            g.add_edge(repost_uid, source_uid)
        except (TypeError, KeyError):
            continue

    if ts:
        return uid_ts, g
    return g
Exemplo n.º 2
0
def auto():
    """Rebuild the HotStatus table from the last 90 days of statuses.

    Steps, in order:
      1. Delete every existing ``HotStatus`` row and commit.
      2. Search for statuses whose ``timestamp`` lies strictly between local
         midnight 90 days ago and local midnight today, sorted by
         ``-reposts_count`` (presumably most-reposted first -- confirm
         against the search backend), with ``max_offset=10000``.
      3. Store the first 100 results as ``HotStatus`` rows, committing after
         each one.

    Relies on a module-level search handle named ``s`` (see the note in the
    body). Prints the total hit count and the index of each stored row.
    """
    # Clear the existing data.
    # The loop variable must NOT be called `s`: any assignment to `s` inside
    # this function makes it local, which would hide the module-level search
    # handle used below.
    for old_status in db.session.query(HotStatus):
        db.session.delete(old_status)
    db.session.commit()

    delta = datetime.timedelta(days=90)
    # One call to today() so the window end and updateTime cannot disagree
    # if the function runs across midnight.
    now = date.today()
    updatetime = now
    end_ts1 = time.mktime(datetime.datetime(now.year, now.month, now.day).timetuple())

    bg = now - delta
    begin_ts1 = time.mktime(datetime.datetime(bg.year, bg.month, bg.day).timetuple())

    # `s` is expected to be a module-level search object, e.g.:
    #s = XapianSearch(path='/opt/xapian_weibo/data/', name='master_timeline_weibo', schema_version=2)
    count, get_results = s.search(query={'timestamp': {'$gt': begin_ts1, '$lt': end_ts1}}, sort_by=['-reposts_count'], max_offset=10000, fields=['text', 'attitudes_count', 'source', 'timestamp', 'reposts_count', 'comments_count', 'user', '_id', 'retweeted_status'])
    print(count)

    i = 0
    for status in get_results():
        # Only the first 100 results are stored.
        if i >= 100:
            break

        s_id = status['_id']
        text = status['text']

        # `source` is expected to be markup like <a ...>Platform</a>; fall
        # back to the "unknown platform" label when it is empty or does not
        # have that shape instead of crashing on `None.group`.
        source_match = None
        if status['source']:
            source_match = re.match('<.*?>(.*)<.*?>', status['source'])
        if source_match:
            sourcePlatform = source_match.group(1)
        else:
            sourcePlatform = u'未知平台'

        postDate = datetime.datetime.fromtimestamp(status['timestamp'])

        # Falsy values are normalised: ids become None, counters become 0.
        uid = status['user'] or None
        retweetedMid = status['retweeted_status'] or None
        repostsCount = status['reposts_count'] or 0
        commentsCount = status['comments_count'] or 0
        attitudesCount = status['attitudes_count'] or 0

        updateTime = updatetime
        hot_status = HotStatus(id=s_id, text=text, sourcePlatform=sourcePlatform, postDate=postDate, uid=uid, retweetedMid=retweetedMid, repostsCount=repostsCount, commentsCount=commentsCount, attitudesCount=attitudesCount, updateTime=updateTime)
        db.session.add(hot_status)
        db.session.commit()
        print(i)
        i += 1