def subopinion_content(topic, start_ts, end_ts, weibo_limit):
    """Fetch ordinary (non-news) weibo posts for a topic within a time window.

    News-style posts — texts wrapped in 【】 brackets — are excluded via a
    must_not wildcard clause; the remainder are returned newest-schema dicts.

    Args:
        topic: ES index name holding the topic's documents.
        start_ts: inclusive lower bound on 'timestamp'.
        end_ts: exclusive upper bound on 'timestamp'.
        weibo_limit: maximum number of hits to return ('size').

    Returns:
        list of dicts with keys: news_id ('weibo'), content, id, datetime,
        comment, retweeted, uid.
    """
    query_body = {
        'query': {
            'bool': {
                # drop news-style posts (titles wrapped in 【】)
                'must_not': [{'wildcard': {'text': '*【*】*'}}],
                'must': [{
                    'range': {
                        'timestamp': {'lt': end_ts, 'gte': start_ts}
                    }
                }]
            }
        },
        'size': weibo_limit
    }
    subopinion_results = weibo_es.search(
        index=topic, doc_type=weibo_index_type,
        body=query_body)['hits']['hits']
    normal_list = []
    for key_weibo in subopinion_results:
        source = key_weibo['_source']
        normal_list.append({
            'news_id': 'weibo',
            'content': source['text'],
            'id': source['mid'],
            'datetime': ts2datetime_full(source['timestamp']),
            # Counts may be absent on some documents; default to 0.
            # (Replaces bare `except:` clauses that also swallowed
            # unrelated errors such as typos or interrupts.)
            'comment': source.get('comment', 0),
            'retweeted': source.get('retweeted', 0),
            'uid': source['uid']
        })
    return normal_list
def news_content(topic, start_ts, end_ts, news_limit=NEWS_LIMIT):
    """Fetch news-style weibo posts (text matching '*【*】*') for a topic.

    Complement of subopinion_content: here the 【】 wildcard is a `must`
    clause rather than `must_not`.

    Args:
        topic: ES index name holding the topic's documents.
        start_ts: inclusive lower bound on 'timestamp'.
        end_ts: exclusive upper bound on 'timestamp'.
        news_limit: maximum number of hits to return (default NEWS_LIMIT).

    Returns:
        list of dicts with keys: news_id ('news'), content168, id, datetime,
        comment, retweeted.
    """
    query_body = {
        'query': {
            'bool': {
                'must': [{
                    'wildcard': {'text': '*【*】*'}
                }, {
                    'range': {
                        'timestamp': {'lt': end_ts, 'gte': start_ts}
                    }
                }]
            }
        },
        'size': news_limit
    }
    news_results = weibo_es.search(
        index=topic, doc_type=weibo_index_type,
        body=query_body)['hits']['hits']
    news_list = []
    for key_weibo in news_results:
        source = key_weibo['_source']
        news_list.append({
            'news_id': 'news',
            'content168': source['text'],
            'id': source['mid'],
            'datetime': ts2datetime_full(source['timestamp']),
            # Default missing counts to 0, matching how subopinion_content
            # treats these optional fields (previously an unguarded lookup
            # that crashed on documents lacking either key).
            'comment': source.get('comment', 0),
            'retweeted': source.get('retweeted', 0)
        })
    return news_list
def cron_index_topic(topic, begin_ts = BEGIN_TS, end_ts = END_TS):
    """Aggregate statistics for a topic over [begin_ts, end_ts) and persist them.

    Pulls all matching posts, derives: distinct participant count, time span,
    per-city counts (via geo2city), top keywords, and two curated post lists
    (by time and by media), then stores everything via save_rt_results.

    NOTE(review): Python 2 code (print statements). The heavy printing looks
    like debug/tracing output left in a cron job.
    """
    if topic and topic != '':
        topic = topic.strip()
    query_dict = {
        'timestamp': {'$gt': begin_ts, '$lt': end_ts},
        'topics': topic
    }
    # Two identical searches: `results` is a one-shot iterator consumed by the
    # loop below, so a second handle (`results2`) is needed for top_keywords.
    count, results = s.search(query=query_dict, sort_by=[SORT_FIELD], fields=RESP_ITER_KEYS)
    count2, results2 = s.search(query=query_dict, sort_by=[SORT_FIELD], fields=RESP_ITER_KEYS)
    user_raw_list = []
    time_list = []
    sublist_by_time = []
    city_dict = {}
    # test whether city_dict works
    # city = geo2city('219.224.135.46')
    # print city,city_dict.setdefault(city,0)
    # city_dict[city] += 1
    # print 'city_dict[city]',city_dict[city]
    print 'count', count
    for r in results():
        user_raw_list.append(r['user'])  # raw (non-deduplicated) user list
        time_list.append(r['timestamp'])  # timestamp list (search is sorted, so ends give the span)
        # sublist of selected fields per post, later ranked two ways
        sublist_by_time.append((r['timestamp'], r['reposts_count'], r['user'], r['source'], r['text'], r['comments_count'], r['geo']))
        city = geo2city(r['geo'])
        # print city
        city_dict.setdefault(city, 0)
        city_dict[city] += 1  # increment this city's post count
        # print 'city_dict[city]',city_dict[city]
        # print '_id',r['_id']
        # print 'user',r['user']
        # print 'timestamp',ts2datetime_full(r['timestamp'])
        # print 'reposts_count',r['reposts_count']
    user_count = len(list(set(user_raw_list)))  # number of distinct participants
    print 'user_count', user_count
    # Relies on the sort_by=[SORT_FIELD] ordering: first/last entries bound the
    # topic's active period. Raises IndexError if the search returned no posts
    # — presumably the cron only runs for topics with activity; verify.
    topic_begin_ts = ts2datetime_full(time_list[0])
    topic_end_ts = ts2datetime_full(time_list[-1])
    print 'topic_begin_ts', topic_begin_ts
    print 'topic_end_ts', topic_end_ts
    final_list_by_time = select_by_time(sublist_by_time, O_LIMIT)
    final_list_by_media = select_by_media(sublist_by_time, M_LIMIT)
    # Debug dump of the media-ranked selection
    print 'final_list_by_media'
    for item in final_list_by_media:
        print 'timestamp', ts2datetime_full(item['timestamp'])
        print 'reposts_count', item['reposts_count']
        print 'user', item['user']
        print 'domain', uid2domain(item['user'])
        print 'source', item['source']
        print 'text', item['text']
        print 'comments_count', item['comments_count']
        print 'geo', item['geo']
        print 'username', item['username']
        print 'profile_image_url', item['profile_image_url']
    # Debug dump of the time-ranked selection
    print 'final_list_by_time'
    for item in final_list_by_time:
        print 'timestamp', ts2datetime_full(item['timestamp'])
        print 'reposts_count', item['reposts_count']
        print 'user', item['user']
        print 'source', item['source']
        print 'text', item['text']
        print 'comments_count', item['comments_count']
        print 'geo', item['geo']
        print 'username', item['username']
        print 'profile_image_url', item['profile_image_url']
    top_city_list = top_city(city_dict)
    '''
    for city in top_city_list:
        print 'top_city_list',city
    '''
    # Uses the second search handle; `results` was already exhausted above.
    top_keywords_list = top_keywords(results2, top = K_LIMIT)
    '''
    print 'top_keywords'
    for keyword in top_keywords_list:
        print 'keyword',keyword[0].decode('utf-8').encode('utf-8'),keyword[1]
    '''
    save_rt_results(topic, count, user_count, time_list,\
                    top_city_list, top_keywords_list, final_list_by_time, final_list_by_media)
def news_content(task_source, task_id, news_limit=NEWS_LIMIT):
    """Query news-style posts for a task and normalize them into dicts.

    A post counts as "news" when its text matches '*【*】*' or a hashtag
    pattern ('*#*#*' for weibo, '*#*' otherwise). Non-weibo sources are
    additionally sorted by 'share'.

    NOTE(review): if this module also defines the earlier
    news_content(topic, start_ts, end_ts, ...), this later definition
    shadows it — consider renaming one of them.

    Args:
        task_source: 'weibo', 'facebook', or anything else (twitter-style).
        task_id: ES index to search; task_source doubles as doc_type.
        news_limit: maximum number of hits (default NEWS_LIMIT).

    Returns:
        list of dicts with keys: news_id ('news'), content168, id, datetime,
        comment, retweeted.
    """
    hashtag_pattern = '*#*#*' if task_source == 'weibo' else '*#*'
    query_body = {
        'query': {
            'bool': {
                'should': [
                    {'wildcard': {'text': '*【*】*'}},
                    {'wildcard': {'text': hashtag_pattern}},
                ]
            }
        },
        'size': news_limit,
    }
    if task_source != 'weibo':
        query_body['sort'] = 'share'

    # Per-source names of the document-id and repost-count fields.
    id_field, repost_field = {
        'weibo': ('mid', 'retweeted'),
        'facebook': ('fid', 'share'),
    }.get(task_source, ('tid', 'share'))

    hits = es_intel.search(
        index=task_id, doc_type=task_source,
        body=query_body)['hits']['hits']

    news_list = []
    for hit in hits:
        src = hit['_source']
        # 'uid' is looked up but never emitted; the lookup is kept so a
        # document missing the key still fails the same way as before.
        uid = src['uid']
        news_list.append({
            'news_id': 'news',
            'content168': src['text'],
            'id': src[id_field],
            'datetime': ts2datetime_full(src['timestamp']),
            'comment': src['comment'],
            'retweeted': src[repost_field],
        })
    return news_list