def make_network(topic, date, window_size, max_size=100000, ts=False):
    """Build a directed repost graph for the statuses matching ``topic``.

    The search window ends at ``datetime2ts(date)`` and starts
    ``window2time(window_size)`` earlier.  Every matching status that carries
    a ``retweeted_status`` adds an edge pointing from the reposting user to
    the author of the reposted status, unless either user is in the trash
    list.  Statuses that raise ``TypeError`` or ``KeyError`` while being
    processed are skipped.

    Args:
        topic: topic text; it is UTF-8 encoded and segmented with ``cut``.
        date: end of the window, converted with ``datetime2ts``.
        window_size: window length, converted with ``window2time``.
        max_size: passed to the search as ``max_offset``.
        ts: when true, timestamps are fetched as well and the earliest
            timestamp seen per user is tracked.

    Returns:
        The ``nx.DiGraph`` when ``ts`` is false, otherwise a tuple
        ``(uid_ts, graph)`` where ``uid_ts`` maps a user id to the smallest
        timestamp recorded for that user.
    """
    end_time = datetime2ts(date)
    start_time = end_time - window2time(window_size)

    graph = nx.DiGraph()

    # need repost index
    topic = cut(s, topic.encode('utf-8'))
    query_dict = {'text': topic, 'timestamp': {'$gt': start_time, '$lt': end_time}}

    # Timestamps are only requested when the caller wants them back.
    if ts:
        wanted = ['text', 'user', 'timestamp', 'retweeted_status']
    else:
        wanted = ['text', 'user', 'retweeted_status']

    # NOTE(review): the keyword is spelled ``field`` here -- confirm the
    # search API does not expect ``fields``.
    count, get_statuses_results = statuses_search.search(
        query=query_dict, field=wanted, max_offset=max_size)
    print('topic statuses count %s' % count)

    earliest = {}
    for status in get_statuses_results():
        try:
            if not status['retweeted_status']:
                continue

            reposter = status['user']
            origin_mid = status['retweeted_status']
            if ts:
                reposted_at = int(status['timestamp'])

            origin = acquire_status_by_id(origin_mid)
            origin_uid = origin['user']
            if ts:
                origin_at = int(origin['timestamp'])

            if is_in_trash_list(reposter) or is_in_trash_list(origin_uid):
                continue

            if ts:
                # Keep the smallest timestamp per user, reposter first.
                for uid, seen_at in ((reposter, reposted_at),
                                     (origin_uid, origin_at)):
                    if uid not in earliest or earliest[uid] > seen_at:
                        earliest[uid] = seen_at

            graph.add_edge(reposter, origin_uid)
        except (TypeError, KeyError):
            # Malformed or unresolved statuses are ignored on purpose.
            continue

    if ts:
        return earliest, graph
    return graph
def _extract_source_platform(source):
    """Return the text enclosed by the markup in ``source``, else the unknown-platform label."""
    if source:
        matched = re.match('<.*?>(.*)<.*?>', source)
        # A non-empty source that does not look like "<tag>name</tag>" used to
        # crash on ``None.group``; treat it as an unknown platform instead.
        if matched:
            return matched.group(1)
    return u'未知平台'


def auto(days=90, limit=100):
    """Rebuild the ``HotStatus`` table from the most reposted recent statuses.

    All existing ``HotStatus`` rows are deleted, then the search object ``s``
    is queried for statuses whose timestamp lies between local midnight
    ``days`` days ago and local midnight today, sorted by descending repost
    count.  The first ``limit`` results are stored as ``HotStatus`` rows, each
    committed individually.

    Args:
        days: length of the look-back window in days (default 90).
        limit: maximum number of statuses to store (default 100).
    """
    # Clear the previous data.
    for stale in db.session.query(HotStatus):
        db.session.delete(stale)
    db.session.commit()

    today = date.today()
    window_start = today - datetime.timedelta(days=days)
    # date.timetuple() is midnight of that day, same as the datetime built
    # from year/month/day.
    end_ts1 = time.mktime(today.timetuple())
    begin_ts1 = time.mktime(window_start.timetuple())

    # The loop variables in this function used to be called ``s`` as well,
    # which made ``s`` local and broke the call below (UnboundLocalError on an
    # empty table, otherwise ``search`` was looked up on a HotStatus row).
    # NOTE(review): ``s`` must now resolve to a module-level search object
    # (the original had a commented-out ``XapianSearch(...)`` construction
    # here) -- confirm one is defined.
    count, get_results = s.search(
        query={'timestamp': {'$gt': begin_ts1, '$lt': end_ts1}},
        sort_by=['-reposts_count'],
        max_offset=10000,
        fields=['text', 'attitudes_count', 'source', 'timestamp',
                'reposts_count', 'comments_count', 'user', '_id',
                'retweeted_status'])
    print(count)

    for i, status in enumerate(get_results()):
        if i >= limit:
            break

        hot_status = HotStatus(
            id=status['_id'],
            text=status['text'],
            sourcePlatform=_extract_source_platform(status['source']),
            postDate=datetime.datetime.fromtimestamp(status['timestamp']),
            # Falsy values are normalised: ids become None, counters become 0.
            uid=status['user'] or None,
            retweetedMid=status['retweeted_status'] or None,
            repostsCount=status['reposts_count'] or 0,
            commentsCount=status['comments_count'] or 0,
            attitudesCount=status['attitudes_count'] or 0,
            updateTime=today)
        db.session.add(hot_status)
        # Committed per row, as before, so earlier rows survive a later failure.
        db.session.commit()
        print(i)