예제 #1
0
파일: utils.py 프로젝트: zhhhzhang/xnr1
def get_symbol_weibo(task_source,
                     task_id,
                     start_ts,
                     end_ts,
                     unit=MinInterval):  # fishbone diagram
    """Pick up to three distinctly titled posts per cluster for a task.

    Fetches the document whose ``name`` equals ``task_id`` from the
    topics-river index, decodes its ``features`` and ``cluster_dump_dict``
    JSON fields, and for every cluster keeps at most three posts that

    * contain a ``【...】`` title in their ``content``,
    * have a ``datetime`` inside ``[start_ts, end_ts]`` (both inclusive), and
    * carry a title not already selected for that cluster.

    Args:
        task_source: Source name. Only compared with ``'weibo'`` to choose
            the reference date when ``S_TYPE == 'test'``.
        task_id: Value matched against the ``name`` field of the index.
        start_ts: Inclusive lower bound of the window. Replaced in test mode.
        end_ts: Inclusive upper bound of the window.
        unit: Currently unused by the selection logic; kept so existing
            callers keep working.

    Returns:
        dict mapping ``features[clusterid][0]`` to the list of selected post
        dicts for that cluster. Clusters without a selected post are absent.

    Raises:
        IndexError: if the search returns no hit for ``task_id``.
    """
    if S_TYPE == 'test':
        # Test mode pins the window start to five days before the configured
        # sample date of the chosen source.
        # NOTE(review): the original code also computed an ``over_ts`` here
        # that was never read, so ``end_ts`` is NOT overridden in test mode.
        # Confirm whether ``end_ts`` was meant to be pinned as well.
        reference_date = S_DATE if task_source == 'weibo' else S_DATE_FB
        start_ts = datetime2ts(reference_date) - 5 * 24 * 3600

    query_body = {'query': {'bool': {'must': [{'term': {'name': task_id}}]}}}

    # One search is enough: the same query used to be sent twice and the
    # first response was thrown away.
    hits = es_intel.search(index=topics_river_index_name,
                           doc_type=topics_river_index_type,
                           body=query_body)['hits']['hits']
    symbol = hits[0]['_source']

    features = json.loads(symbol['features'])
    symbol_weibos = json.loads(symbol['cluster_dump_dict'])

    # Compiled once instead of being looked up for every post.
    title_pattern = re.compile(r'【.*】')

    weibos = {}
    for clusterid, contents in symbol_weibos.iteritems():
        seen_titles = set()
        for post in contents:
            ts = full_datetime2ts(post['datetime'])

            titles = title_pattern.findall(post['content'].encode('utf8'))
            if not titles:
                # Posts without a bracketed title are never selected.
                continue
            title = titles[0]

            # The original note says the lower bound should eventually become
            # ``end_ts - unit``; the full window is used because the most
            # recent 15 minutes had no data.
            if start_ts <= ts <= end_ts and title not in seen_titles:
                weibos.setdefault(features[clusterid][0], []).append(post)
                seen_titles.add(title)

            # Every selected post adds exactly one new title, so the size of
            # the set is the number of posts kept for this cluster.
            if len(seen_titles) == 3:
                break
    return weibos
예제 #2
0
def get_topics_river(task_source,
                     task_id,
                     start_ts,
                     end_ts,
                     unit=MinInterval):  # topic river
    """Build the topic-river series for a task, keyed by leading keyword.

    Reads the ``features`` JSON of the first topics-river document whose
    ``name`` equals ``task_id``, asks ``cul_key_weibo_time_count`` for the
    per-cluster time series, and re-keys those series by the first keyword
    of each cluster.

    Args:
        task_source: Forwarded unchanged to ``cul_key_weibo_time_count``.
        task_id: Value matched against the ``name`` field; also forwarded.
        start_ts: Forwarded as the start of the counting window.
        end_ts: Forwarded as the end of the counting window.
        unit: Forwarded as the bucket-width argument.

    Returns:
        dict mapping the first keyword of every cluster that has at least one
        keyword to the value ``cul_key_weibo_time_count`` returned for that
        cluster id.

    Raises:
        IndexError: if the search returns no hit for ``task_id``.
    """
    name_filter = {'term': {'name': task_id}}
    response = es_intel.search(
        index=topics_river_index_name,
        doc_type=topics_river_index_type,
        body={'query': {'bool': {'must': [name_filter]}}})
    first_source = response['hits']['hits'][0]['_source']
    news_topics = json.loads(first_source['features'])

    series_by_cluster = cul_key_weibo_time_count(
        task_source, task_id, news_topics, start_ts, end_ts, unit)

    results = {}
    for cluster_id, keywords in news_topics.iteritems():
        if len(keywords) == 0:
            # Clusters without keywords have no label to publish under.
            continue
        results[keywords[0]] = series_by_cluster[cluster_id]
    return results
예제 #3
0
파일: utils.py 프로젝트: zhhhzhang/xnr1
def cul_key_weibo_time_count(task_source, task_id, news_topics, start_ts,
                             over_ts, during):
    """Count keyword-matching documents per cluster in consecutive buckets.

    For every cluster in ``news_topics`` that has at least one keyword, the
    window ending at ``over_ts`` is cut into buckets of width ``Day`` and one
    search per bucket is sent to index ``task_id`` / doc type
    ``task_source``. A document matches when its ``timestamp`` lies in
    ``[begin_ts, end_ts)`` and its ``keywords_string`` matches
    ``*<keyword>*`` for at least one of the cluster's keywords.

    Args:
        task_source: Used as the Elasticsearch ``doc_type``; also compared
            with ``'weibo'`` to choose the reference date in test mode.
        task_id: Used as the Elasticsearch index name.
        news_topics: dict mapping cluster id to a list of keyword strings.
        start_ts: Start of the window; converted with ``int()``. Replaced in
            test mode.
        over_ts: End of the window; converted with ``int()`` and then passed
            through ``ts2HourlyTime``. Replaced in test mode.
        during: Ignored. The bucket width is always the module-level ``Day``
            (see the note in the body).

    Returns:
        dict mapping cluster id to a list of ``(ts2datetime(end_ts), count)``
        tuples sorted by their first element. Clusters with no keywords are
        absent. Buckets that map to the same ``ts2datetime`` label overwrite
        each other, the later bucket winning.
    """
    if S_TYPE == 'test':
        # Test mode pins the window to the five days before the configured
        # sample date of the chosen source.
        reference_date = S_DATE if task_source == 'weibo' else S_DATE_FB
        over_ts = datetime2ts(reference_date)
        start_ts = over_ts - 5 * 24 * 3600

    # NOTE(review): the caller-supplied bucket width is discarded here, as in
    # the original code. An old inline comment talks about 900-second
    # buckets, which no longer matches. Confirm before honouring ``during``.
    during = Day

    # The window is the same for every cluster, so normalise it once instead
    # of once per cluster.
    start_ts = int(start_ts)
    over_ts = ts2HourlyTime(int(over_ts), during)
    # Explicit floor division: the bucket count must be an integer.
    interval = (over_ts - start_ts) // during

    key_weibo_time_count = {}
    for clusterid, keywords in news_topics.iteritems():
        # e.g. {u'd2e97cf7-...': [u'\u77e5\u8bc6', u'\u5e7f\u5dde', ...]}
        if not keywords:
            continue

        # The keyword clauses do not depend on the bucket; build them once.
        keyword_clauses = [
            {'wildcard': {'keywords_string': '*' + word + '*'}}
            for word in keywords
        ]

        # A fresh accumulator per cluster, so buckets counted for one
        # cluster can never show up in another cluster's series.
        time_dict = {}
        for i in range(interval, 0, -1):
            begin_ts = over_ts - during * i
            end_ts = begin_ts + during
            query_body = {
                'query': {
                    'bool': {
                        'must': [
                            {'range': {
                                'timestamp': {
                                    'gte': begin_ts,
                                    'lt': end_ts
                                }
                            }},
                            {'bool': {'should': keyword_clauses}},
                        ]
                    }
                }
            }
            key_weibo = es_intel.search(index=task_id,
                                        doc_type=task_source,
                                        body=query_body)
            # Number of matching documents in this bucket.
            time_dict[ts2datetime(end_ts)] = key_weibo['hits']['total']

        key_weibo_time_count[clusterid] = sorted(time_dict.items(),
                                                 key=lambda x: x[0])
    return key_weibo_time_count