Example #1
def subopinion_content(topic, start_ts, end_ts, weibo_limit):
    # Fetch ordinary posts (text without a 【...】 headline) in [start_ts, end_ts) from the topic index.
    query_body = {
        'query': {
            'bool': {
                'must_not': [{
                    'wildcard': {
                        'text': '*【*】*'
                    }
                }],
                'must': [{
                    'range': {
                        'timestamp': {
                            'lt': end_ts,
                            'gte': start_ts
                        }
                    }
                }]
            }
        },
        'size': weibo_limit
    }
    subopinion_results = weibo_es.search(
        index=topic, doc_type=weibo_index_type,
        body=query_body)['hits']['hits']  #['_source']
    normal_list = []
    for key_weibo in subopinion_results:
        text_weibo = key_weibo['_source']['text']
        mid_weibo = key_weibo['_source']['mid']
        timestamp = key_weibo['_source']['timestamp']
        # Some documents lack comment/retweet counts; default missing fields to 0.
        comment = key_weibo['_source'].get('comment', 0)
        retweeted = key_weibo['_source'].get('retweeted', 0)
        uid = key_weibo['_source']['uid']
        normal_list.append({
            'news_id': 'weibo',
            'content': text_weibo,
            'id': mid_weibo,
            'datetime': ts2datetime_full(timestamp),
            'comment': comment,
            'retweeted': retweeted,
            'uid': uid
        })
    return normal_list
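
The function relies on module-level names that are not shown in the example (weibo_es, weibo_index_type, ts2datetime_full). A minimal sketch of those dependencies, assuming an elasticsearch-py client and Unix timestamps (the values below are hypothetical placeholders):

from datetime import datetime
from elasticsearch import Elasticsearch

# Hypothetical module-level setup assumed by the examples above.
weibo_es = Elasticsearch(['127.0.0.1:9200'])  # cluster holding the per-topic indices
weibo_index_type = 'text'                     # doc_type used by the per-topic indices

def ts2datetime_full(ts):
    # Convert a Unix timestamp into a 'YYYY-MM-DD HH:MM:SS' string.
    return datetime.fromtimestamp(int(ts)).strftime('%Y-%m-%d %H:%M:%S')
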
Example #2
def news_content(topic, start_ts, end_ts, news_limit=NEWS_LIMIT):
    # Fetch news-like posts (text containing a 【...】 headline) in [start_ts, end_ts) from the topic index.
    query_body = {
        'query': {
            'bool': {
                'must': [{
                    'wildcard': {
                        'text': '*【*】*'
                    }
                }, {
                    'range': {
                        'timestamp': {
                            'lt': end_ts,
                            'gte': start_ts
                        }
                    }
                }]
            }
        },
        'size': news_limit
    }
    news_results = weibo_es.search(
        index=topic, doc_type=weibo_index_type,
        body=query_body)['hits']['hits']  #['_source']
    # print topic,weibo_index_type,start_ts,end_ts,query_body
    # print news_results
    news_list = []
    for key_weibo in news_results:
        text_weibo = key_weibo['_source']['text']
        mid_weibo = key_weibo['_source']['mid']
        timestamp = key_weibo['_source']['timestamp']
        comment = key_weibo['_source']['comment']
        retweeted = key_weibo['_source']['retweeted']
        uid = key_weibo['_source']['uid']
        news_list.append({
            'news_id': 'news',
            'content168': text_weibo,
            'id': mid_weibo,
            'datetime': ts2datetime_full(timestamp),
            'comment': comment,
            'retweeted': retweeted
        })
    return news_list
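
A hedged usage sketch pairing the two functions above, assuming a per-topic index name and Unix-timestamp bounds (all values below are placeholders):

# Hypothetical call: partition one hour of the 'example_topic' index into ordinary and news-like posts.
start_ts, end_ts = 1480000000, 1480003600
ordinary_posts = subopinion_content('example_topic', start_ts, end_ts, weibo_limit=200)
news_posts = news_content('example_topic', start_ts, end_ts, news_limit=200)
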
Example #3
def cron_index_topic(topic, begin_ts=BEGIN_TS, end_ts=END_TS):
    # Aggregate statistics for one topic between begin_ts and end_ts.
    if topic and topic.strip():
        topic = topic.strip()
        query_dict = {
            'timestamp': {'$gt': begin_ts, '$lt': end_ts},
            'topics': topic
        }

        # Two identical searches: the first result iterator feeds the loop below,
        # the second (results2) is kept for the keyword extraction pass.
        count, results = s.search(query=query_dict, sort_by=[SORT_FIELD], fields=RESP_ITER_KEYS)
        count2, results2 = s.search(query=query_dict, sort_by=[SORT_FIELD], fields=RESP_ITER_KEYS)
        user_raw_list = []
        time_list = []
        sublist_by_time = []
        city_dict = {}


        # Quick check that the city_dict lookup works:
        # city = geo2city('219.224.135.46')
        # print city,city_dict.setdefault(city,0)
        # city_dict[city] += 1
        # print 'city_dict[city]',city_dict[city]
        print 'count', count

        for r in results():
            user_raw_list.append(r['user'])  # raw list of participating users

            time_list.append(r['timestamp'])  # list of post timestamps

            # Sub-list of selected fields used by the time/media selections below.
            sublist_by_time.append((r['timestamp'], r['reposts_count'], r['user'], r['source'],
                                    r['text'], r['comments_count'], r['geo']))

            city = geo2city(r['geo'])
            # print city
            city_dict.setdefault(city, 0)
            city_dict[city] += 1  # increment the count for this city
            # print 'city_dict[city]',city_dict[city]
            # print '_id',r['_id']
            # print 'user',r['user']
            # print 'timestamp',ts2datetime_full(r['timestamp'])
            # print 'reposts_count',r['reposts_count']

        user_count = len(set(user_raw_list))  # number of distinct participants
        print 'user_count',user_count

        topic_begin_ts = ts2datetime_full(time_list[0])
        topic_end_ts = ts2datetime_full(time_list[-1])
        print 'topic_begin_ts',topic_begin_ts
        print 'topic_end_ts',topic_end_ts

        final_list_by_time = select_by_time(sublist_by_time, O_LIMIT)
        final_list_by_media = select_by_media(sublist_by_time, M_LIMIT)
        print 'final_list_by_media'
        for item in final_list_by_media:
            print 'timestamp', ts2datetime_full(item['timestamp'])
            print 'reposts_count',item['reposts_count']
            print 'user',item['user']
            print 'domain',uid2domain(item['user'])
            print 'source',item['source']
            print 'text',item['text']
            print 'comments_count', item['comments_count']
            print 'geo', item['geo']
            print 'username', item['username']
            print 'profile_image_url', item['profile_image_url']

        print 'final_list_by_time'
        for item in final_list_by_time:
            print 'timestamp', ts2datetime_full(item['timestamp'])
            print 'reposts_count',item['reposts_count']
            print 'user',item['user']
            print 'source',item['source']
            print 'text',item['text']
            print 'comments_count', item['comments_count']
            print 'geo', item['geo']
            print 'username', item['username']
            print 'profile_image_url', item['profile_image_url']

        top_city_list = top_city(city_dict)
        '''
        for city in top_city_list:
            print 'top_city_list',city
        '''

        top_keywords_list = top_keywords(results2, top=K_LIMIT)
        '''
        print 'top_keywords'
        for keyword in top_keywords_list:
            print 'keyword',keyword[0].decode('utf-8').encode('utf-8'),keyword[1]
        '''
        save_rt_results(topic, count, user_count, time_list,\
                top_city_list, top_keywords_list, final_list_by_time, final_list_by_media)
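
The manual city_dict bookkeeping above can also be written with collections.Counter; a minimal sketch, assuming geo2city behaves as in the loop and that a fresh result iterator is available:

from collections import Counter

# Hypothetical Counter-based equivalent of the city counting in cron_index_topic.
city_counter = Counter(geo2city(r['geo']) for r in results())
top_cities = city_counter.most_common(10)  # e.g. the 10 busiest cities
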
def news_content(task_source, task_id, news_limit=NEWS_LIMIT):
    # Cross-platform variant: fetch news-like posts from a task index; the id and share field names differ per source.

    if task_source == 'weibo':
        query_body = {
            'query': {
                'bool': {
                    'should': [{
                        'wildcard': {
                            'text': '*【*】*'
                        }
                    }, {
                        'wildcard': {
                            'text': '*#*#*'
                        }
                    }]
                }
            },
            'size': news_limit
        }
    else:
        query_body = {
            'query': {
                'bool': {
                    'should': [{
                        'wildcard': {
                            'text': '*【*】*'
                        }
                    }, {
                        'wildcard': {
                            'text': '*#*'
                        }
                    }]
                }
            },
            'sort': 'share',
            'size': news_limit
        }

    news_results = es_intel.search(
        index=task_id, doc_type=task_source,
        body=query_body)['hits']['hits']  #['_source']
    # print topic,weibo_index_type,start_ts,end_ts,query_body
    # print news_results
    news_list = []
    for key_weibo in news_results:
        text_weibo = key_weibo['_source']['text']
        uid = key_weibo['_source']['uid']
        timestamp = key_weibo['_source']['timestamp']
        comment = key_weibo['_source']['comment']

        if task_source == 'weibo':
            mid_weibo = key_weibo['_source']['mid']
            retweeted = key_weibo['_source']['retweeted']
        elif task_source == 'facebook':
            mid_weibo = key_weibo['_source']['fid']
            retweeted = key_weibo['_source']['share']
        else:
            mid_weibo = key_weibo['_source']['tid']
            retweeted = key_weibo['_source']['share']

        news_list.append({
            'news_id': 'news',
            'content168': text_weibo,
            'id': mid_weibo,
            'datetime': ts2datetime_full(timestamp),
            'comment': comment,
            'retweeted': retweeted
        })
    return news_list
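
The per-source branch above selects the id and repost/share field names for each platform. A table-driven sketch of the same mapping, assuming only the field names shown in the branch:

# Hypothetical lookup table: (id field, repost/share field) per source.
SOURCE_FIELDS = {
    'weibo': ('mid', 'retweeted'),
    'facebook': ('fid', 'share'),
}

# Anything else falls back to tid/share, mirroring the else branch above.
id_field, share_field = SOURCE_FIELDS.get(task_source, ('tid', 'share'))
mid_weibo = key_weibo['_source'][id_field]
retweeted = key_weibo['_source'][share_field]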