Example #1
def get_opinions(task_source, task_id, xnr_user_no, opinion_keywords_list,
                 opinion_type, intel_type):

    query_item = 'text'
    nest_query_list = []
    tweets_list = []
    if task_source == 'weibo':

        if S_TYPE == 'test':
            current_time = datetime2ts(S_DATE)

        else:
            current_time = int(time.time())

        index_name_list = get_flow_text_index_list(current_time, days=5)
        sort_item = 'retweeted'
        for keyword in opinion_keywords_list:
            nest_query_list.append(
                {'wildcard': {
                    query_item: '*' + keyword + '*'
                }})
        uid_list = []

        # minimum_should_match: at least one keyword clause must match
        SHOULD_PERCENT = 1

        if intel_type == 'all':
            query_body = {
                'query': {
                    'bool': {
                        'should': nest_query_list,
                        'minimum_should_match': SHOULD_PERCENT
                    }
                },
                'sort': {
                    sort_item: {
                        'order': 'desc'
                    }
                },
                'size': MAX_SEARCH_SIZE
            }

        elif intel_type == 'follow':

            try:
                follow_results = es_xnr.get(index=weibo_xnr_fans_followers_index_name,
                                            doc_type=weibo_xnr_fans_followers_index_type,
                                            id=xnr_user_no)['_source']

                if follow_results:
                    uid_list = follow_results['followers']
            except Exception:
                uid_list = []

            query_body = {
                'query': {
                    'bool': {
                        'should': nest_query_list,
                        'minimum_should_match': SHOULD_PERCENT,
                        'must': [{
                            'terms': {
                                'uid': uid_list
                            }
                        }]
                    }
                },
                'sort': {
                    sort_item: {
                        'order': 'desc'
                    }
                },
                'size': MAX_SEARCH_SIZE
            }

        elif intel_type == 'influence':
            date = ts2datetime(current_time - 24 * 3600)

            if S_TYPE == 'test':
                date = S_DATE_BCI

            weibo_bci_index_name = weibo_bci_index_name_pre + date[:4] + date[5:7] + date[8:10]

            query_body_bci = {
                'query': {
                    'match_all': {}
                },
                'sort': {
                    'user_index': {
                        'order': 'desc'
                    }
                },
                'size': 500
            }

            weibo_bci_results = es_user_portrait.search(
                index=weibo_bci_index_name,
                doc_type=weibo_bci_index_type,
                body=query_body_bci)['hits']['hits']
            if weibo_bci_results:
                for bci_result in weibo_bci_results:
                    uid = bci_result['_source']['user']
                    uid_list.append(uid)

            query_body = {
                'query': {
                    'bool': {
                        'should': nest_query_list,
                        'minimum_should_match': SHOULD_PERCENT,
                        'must': [{
                            'terms': {
                                'uid': uid_list
                            }
                        }]
                    }
                },
                'sort': {
                    sort_item: {
                        'order': 'desc'
                    }
                },
                'size': MAX_SEARCH_SIZE
            }

        else:

            query_sensitive = {
                'query': {
                    'match_all': {}
                },
                "aggs": {
                    "uids": {
                        "terms": {
                            "field": "uid",
                            "order": {
                                "avg_sensitive": "desc"
                            }
                        },
                        "aggs": {
                            "avg_sensitive": {
                                "avg": {
                                    "field": "sensitive"
                                }
                            }
                        }
                    }
                },
                'size': 500000
            }

            es_sensitive_result = es_flow_text.search(index=index_name_list,doc_type='text',\
                    body=query_sensitive)['aggregations']['uids']['buckets']
            for item in es_sensitive_result:
                uid = item['key']
                uid_list.append(uid)

            query_body = {
                'query': {
                    'bool': {
                        'should': nest_query_list,
                        'minimum_should_match': SHOULD_PERCENT,
                        'must': [{
                            'terms': {
                                'uid': uid_list
                            }
                        }]
                    }
                },
                'sort': {
                    sort_item: {
                        'order': 'desc'
                    }
                },
                'size': MAX_SEARCH_SIZE
            }

        # Build tweets_list from the search results

        tweets_results = es_flow_text.search(index=index_name_list,
                                             doc_type='text',
                                             body=query_body)['hits']['hits']

        if tweets_results:
            for item in tweets_results:
                item = item['_source']
                weibo = item['text']
                tweets_list.append(weibo)

    else:
        if S_TYPE == 'test':
            current_time = datetime2ts(S_DATE_FB)
        else:
            current_time = int(time.time())
        uid_list = []
        sort_item = 'share'
        opinion_keywords_list = [
            word.encode('utf-8') for word in opinion_keywords_list
        ]
        en_keywords_list = trans(opinion_keywords_list, target_language='en')
        for i in range(len(opinion_keywords_list)):
            keyword = opinion_keywords_list[i].decode('utf-8')
            traditional_keyword = simplified2traditional(keyword)

            if len(en_keywords_list) == len(opinion_keywords_list):  # make sure the translation did not fail
                en_keyword = en_keywords_list[i]
                nest_query_list.append(
                    {'wildcard': {
                        query_item: '*' + en_keyword + '*'
                    }})

            nest_query_list.append(
                {'wildcard': {
                    query_item: '*' + keyword + '*'
                }})
            nest_query_list.append(
                {'wildcard': {
                    query_item: '*' + traditional_keyword + '*'
                }})

        # minimum_should_match: at least one keyword clause must match
        SHOULD_PERCENT = 1

        if task_source == 'facebook':
            index_name_list = fb_get_flow_text_index_list(current_time, days=5)

            if intel_type == 'all':
                query_body = {
                    'query': {
                        'bool': {
                            'should': nest_query_list,
                            'minimum_should_match': SHOULD_PERCENT
                        }
                    },
                    'sort': {
                        sort_item: {
                            'order': 'desc'
                        }
                    },
                    'size': MAX_SEARCH_SIZE
                }

            elif intel_type == 'follow':

                try:
                    follow_results = es_xnr.get(index=fb_xnr_fans_followers_index_name,
                                                doc_type=fb_xnr_fans_followers_index_type,
                                                id=xnr_user_no)['_source']

                    if follow_results:
                        uid_list = follow_results['fans_list']
                except Exception:
                    uid_list = []

                query_body = {
                    'query': {
                        'bool': {
                            'should': nest_query_list,
                            'minimum_should_match': SHOULD_PERCENT,
                            'must': [{
                                'terms': {
                                    'uid': uid_list
                                }
                            }]
                        }
                    },
                    'sort': {
                        sort_item: {
                            'order': 'desc'
                        }
                    },
                    'size': MAX_SEARCH_SIZE
                }

            elif intel_type == 'influence':
                fb_bci_index_name = fb_bci_index_name_pre + ts2datetime(
                    current_time)
                query_body_bci = {
                    'query': {
                        'match_all': {}
                    },
                    'sort': {
                        'influence': {
                            'order': 'desc'
                        }
                    },
                    'size': 500
                }

                fb_bci_results = es_xnr.search(
                    index=fb_bci_index_name,
                    doc_type=fb_bci_index_type,
                    body=query_body_bci)['hits']['hits']
                #print 'fb_bci_results...',len(fb_bci_results)
                if fb_bci_results:
                    for bci_result in fb_bci_results:
                        uid = bci_result['_source']['uid']
                        uid_list.append(uid)

                query_body = {
                    'query': {
                        'bool': {
                            'should': nest_query_list,
                            'minimum_should_match': SHOULD_PERCENT,
                            'must': [{
                                'terms': {
                                    'uid': uid_list
                                }
                            }]
                        }
                    },
                    'sort': {
                        sort_item: {
                            'order': 'desc'
                        }
                    },
                    'size': MAX_SEARCH_SIZE
                }

            else:

                query_sensitive = {
                    'query': {
                        'match_all': {}
                    },
                    "aggs": {
                        "uids": {
                            "terms": {
                                "field": "uid",
                                "order": {
                                    "avg_sensitive": "desc"
                                }
                            },
                            "aggs": {
                                "avg_sensitive": {
                                    "avg": {
                                        "field": "sensitive"
                                    }
                                }
                            }
                        }
                    },
                    'size': 500
                }

                es_sensitive_result = es_xnr.search(index=index_name_list,doc_type='text',\
                        body=query_sensitive)['aggregations']['uids']['buckets']
                #print 'es_sensitive_result...',len(es_sensitive_result)
                for item in es_sensitive_result:
                    uid = item['key']
                    uid_list.append(uid)

                query_body = {
                    'query': {
                        'bool': {
                            'should': nest_query_list,
                            'minimum_should_match': SHOULD_PERCENT,
                            'must': [{
                                'terms': {
                                    'uid': uid_list
                                }
                            }]
                        }
                    },
                    'sort': {
                        sort_item: {
                            'order': 'desc'
                        }
                    },
                    'size': MAX_SEARCH_SIZE
                }

            #print 'query_body...',query_body
            tweets_results = es_xnr.search(index=index_name_list,
                                           doc_type='text',
                                           body=query_body)['hits']['hits']

            if tweets_results:
                for item in tweets_results:
                    item = item['_source']
                    weibo = item['text']
                    tweets_list.append(weibo)

        else:
            index_name_list = tw_get_flow_text_index_list(current_time, days=5)

            if intel_type == 'all':
                query_body = {
                    'query': {
                        'bool': {
                            'should': nest_query_list,
                            'minimum_should_match': SHOULD_PERCENT
                        }
                    },
                    'sort': {
                        sort_item: {
                            'order': 'desc'
                        }
                    },
                    'size': MAX_SEARCH_SIZE
                }

            elif intel_type == 'follow':

                try:
                    follow_results = es_xnr.get(index=tw_xnr_fans_followers_index_name,
                                                doc_type=tw_xnr_fans_followers_index_type,
                                                id=xnr_user_no)['_source']

                    if follow_results:
                        uid_list = follow_results['followers_list']
                except Exception:
                    uid_list = []

                query_body = {
                    'query': {
                        'bool': {
                            'should': nest_query_list,
                            'minimum_should_match': SHOULD_PERCENT,
                            'must': [{
                                'terms': {
                                    'uid': uid_list
                                }
                            }]
                        }
                    },
                    'sort': {
                        sort_item: {
                            'order': 'desc'
                        }
                    },
                    'size': MAX_SEARCH_SIZE
                }

            elif intel_type == 'influence':
                tw_bci_index_name = tw_bci_index_name_pre + ts2datetime(
                    current_time)
                query_body_bci = {
                    'query': {
                        'match_all': {}
                    },
                    'sort': {
                        'influence': {
                            'order': 'desc'
                        }
                    },
                    'size': 500
                }

                tw_bci_results = es_xnr.search(
                    index=tw_bci_index_name,
                    doc_type=tw_bci_index_type,
                    body=query_body_bci)['hits']['hits']
                if tw_bci_results:
                    for bci_result in tw_bci_results:
                        uid = bci_result['_source']['uid']
                        uid_list.append(uid)

                query_body = {
                    'query': {
                        'bool': {
                            'should': nest_query_list,
                            'minimum_should_match': SHOULD_PERCENT,
                            'must': [{
                                'terms': {
                                    'uid': uid_list
                                }
                            }]
                        }
                    },
                    'sort': {
                        sort_item: {
                            'order': 'desc'
                        }
                    },
                    'size': MAX_SEARCH_SIZE
                }

            else:

                query_sensitive = {
                    'query': {
                        'match_all': {}
                    },
                    "aggs": {
                        "uids": {
                            "terms": {
                                "field": "uid",
                                "order": {
                                    "avg_sensitive": "desc"
                                }
                            },
                            "aggs": {
                                "avg_sensitive": {
                                    "avg": {
                                        "field": "sensitive"
                                    }
                                }
                            }
                        }
                    },
                    'size': 500
                }

                es_sensitive_result = es_xnr.search(index=index_name_list,doc_type='text',\
                        body=query_sensitive)['aggregations']['uids']['buckets']
                for item in es_sensitive_result:
                    uid = item['key']
                    uid_list.append(uid)

                query_body = {
                    'query': {
                        'bool': {
                            'should': nest_query_list,
                            'minimum_should_match': SHOULD_PERCENT,
                            'must': [{
                                'terms': {
                                    'uid': uid_list
                                }
                            }]
                        }
                    },
                    'sort': {
                        sort_item: {
                            'order': 'desc'
                        }
                    },
                    'size': MAX_SEARCH_SIZE
                }

            print 'index_name_list...', index_name_list
            print 'query_body........', query_body
            tweets_results = es_xnr.search(index=index_name_list,
                                           doc_type='text',
                                           body=query_body)['hits']['hits']

            if tweets_results:
                for item in tweets_results:
                    item = item['_source']
                    weibo = item['text']
                    tweets_list.append(weibo)

    if tweets_list:
        opinion_name, word_result, text_list = opinion_main(tweets_list,
                                                            k_cluster=5)
        sub_opinion_results = dict()

        topic_keywords_list = []
        summary_text_list = []

        for topic, text in text_list.iteritems():

            topic_name = opinion_name[topic]
            sub_opinion_results[topic_name] = text[:SUB_OPINION_WEIBO_LIMIT]

            topic_keywords_list.extend(topic_name.split('&'))
            summary_text_list.extend(text)

        #try:
        print 'summary_text_list..', len(summary_text_list)
        print 'topic_keywords_list..', topic_keywords_list
        summary = text_generation_main(summary_text_list, topic_keywords_list)
        #summary = summary_main(summary_text_list)
        #except:
        #    summary = ''

    else:
        sub_opinion_results = {}
        summary = ''

    print 'Saving sub-opinion results......'
    print 'summary....', summary
    mark = save_intelligent_opinion_results(task_id, sub_opinion_results,
                                            summary, intel_type)

    return mark
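
Below is a minimal usage sketch for get_opinions. The argument values (task id, xnr_user_no, keywords) are hypothetical; only the parameter names and the intel_type options ('all', 'follow', 'influence', otherwise the sensitive-user fallback) come from the function above.

# Hypothetical call; the concrete values below are illustrative only.
opinion_keywords = [u'education', u'policy']
mark = get_opinions(task_source='weibo',               # 'weibo', 'facebook', otherwise the Twitter path
                    task_id='task_0001',
                    xnr_user_no='WXNR0001',
                    opinion_keywords_list=opinion_keywords,
                    opinion_type='news',                # accepted but not used in the body above
                    intel_type='follow')                # 'all' / 'follow' / 'influence' / sensitive fallback
print 'save mark:', mark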
Example #2
def compute_recommend_subopnion(task_detail):

    print 'Starting analysis......'

    task_id = task_detail['task_id'].strip('"')

    keywords_string = task_detail['keywords_string']

    keywords_list = keywords_string.split('&')  ## split the keyword string on '&' to get a list

    xnr_user_no = task_detail['xnr_user_no']
    mid = task_detail['mid']

    query_item = 'keywords_string'
    nest_query_list = []
    for keyword in keywords_list:
        nest_query_list.append({'wildcard': {query_item: '*' + keyword + '*'}})
    '''
    ## Focus on the users followed by the current virtual human (xnr)
    if S_TYPE == 'test':
        # followers_list = get_result['followers_list']
        # nest_query_list.append({'terms':followers_list})
        print 'all users'
    else:
        get_result = es.get(index=tw_xnr_fans_followers_index_name,doc_type=tw_xnr_fans_followers_index_type,\
        id=xnr_user_no)['_source']
        followers_list = get_result['followers_list']
        nest_query_list.append({'terms':followers_list})
    '''

    if S_TYPE == 'test':
        create_time = datetime2ts(S_DATE_FB)
    else:
        create_time = datehour2ts(ts2datehour(time.time() - 3600))

    #tw_get_flow_text_index_list(create_time)

    #index_name_list_list = tw_get_flow_text_index_list(now_timestamp)
    index_name_list = tw_get_flow_text_index_list(create_time)
    print 'index_name_list::', index_name_list
    es_results = es.search(index=index_name_list,doc_type='text',\
                    body={'query':{'bool':{'must':nest_query_list}},'size':MAX_SEARCH_SIZE})['hits']['hits']

    tw_list = []  ## input for content recommendation and sub-opinion analysis

    if es_results:
        for item in es_results:
            item = item['_source']
            tw = item['text']
            tw_list.append(tw)

    ## Content recommendation

    ## Get the list of recommended sentences
    #print 'tw_list::::::',tw_list
    # print 'Starting content recommendation......'
    # if tw_list:
    #     content_results = summary_main(tw_list)
    # else:
    #     content_results = []

    # print 'Saving content recommendation results......'

    # mark = save_content_recommendation_results(xnr_user_no,mid,task_id.encode('utf-8'),content_results)
    # print 'mark_content:::',mark
    # if mark == False:
    #     print 'Error while saving content recommendation results, pushing the task back onto the queue'
    #     add_task_2_queue(keyword_task_queue_name,task_detail)
    # else:
    #     print 'Content recommendation results saved......'

    ## Sub-opinion analysis
    '''
    Input:
    tw_data: list of posts, [tw1,tw2,...]
    k_cluster: number of sub-topics (default 5)
    Output:
    opinion_name: dict of sub-topic names, {topic1:name1,topic2:name2,...}
    word_result: keywords for each sub-topic, {topic1:[w1,w2,...],topic2:[w1,w2,...],...}
    text_list: texts belonging to each sub-topic, {topic1:[text1,text2,...],topic2:[text1,text2,..],..}
    '''

    print 'Starting sub-opinion computation......'
    if tw_list:
        opinion_name, word_result, text_list = opinion_main(tw_list,
                                                            k_cluster=5)
        sub_opinion_results = dict()

        for topic, text in text_list.iteritems():

            topic_name = opinion_name[topic]
            sub_opinion_results[topic_name] = text[:SUB_OPINION_WEIBO_LIMIT]

    else:
        sub_opinion_results = {}

    print 'Saving sub-opinion results......'
    mark = save_subopnion_results(xnr_user_no, mid, task_id,
                                  sub_opinion_results)
    print 'mark_opinion:::', mark
    if mark == False:

        print 'Error while saving sub-opinion results, pushing the task back onto the queue'

        add_task_2_queue(keyword_task_queue_name, task_detail)

    else:
        print 'Sub-opinion results saved......'
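
For orientation, a sketch of the data shapes this function expects back from opinion_main, following the interface comment above; the topic keys, names, and texts are made-up placeholders, not real clustering output.

# Placeholder shapes only (hypothetical values):
opinion_name = {'topic1': u'price&quality', 'topic2': u'delivery&service'}
word_result = {'topic1': [u'price', u'quality'], 'topic2': [u'delivery', u'service']}
text_list = {'topic1': [u'post about price', u'post about quality'],
             'topic2': [u'post about delivery']}

sub_opinion_results = dict()
for topic, text in text_list.iteritems():
    # map each cluster id to its readable name and keep only the first few posts
    sub_opinion_results[opinion_name[topic]] = text[:5]   # 5 stands in for SUB_OPINION_WEIBO_LIMIT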
Example #3
def comments_rubbish_clustering_calculation(comments, cluster_num, \
        cluster_eva_min_size=CLUSTER_EVA_MIN_SIZE, \
        version=COMMENT_CLUSTERING_PROCESS_FOR_CLUTO_VERSION):
    """评论垃圾过滤、聚类
       input: comments
           comment中包含news_id, news_content
       cluster_infos: 聚簇信息
       item_infos:单条信息列表, 数据字段:clusterid、weight、same_from、duplicate
    """
    # clusterid for meaningless items: ads from ad_filter, rubbish from the SVM, and news from the subjectivity classifier
    NON_CLUSTER_ID = 'nonsense'

    # clusterid for the "other" class
    OTHER_CLUSTER_ID = 'other'

    # clusterid for items that are displayed directly
    DIRECT_CLUSTER_ID = 'direct'
    DIRECT_CLUSTER_FEATURE = [u'聚簇']

    # minimum number of input items for clustering; with fewer items no clustering is performed
    MIN_CLUSTERING_INPUT = 20

    # cluster info, mainly each cluster's feature words
    clusters_infos = {'features': dict()}

    # list of individual items; each stores clusterid, weight, and sentiment fields
    items_infos = []

    # Preprocess data fields
    print('\tData preprocess...')
    inputs = []
    for r in comments:
        r['title'] = ''
        r['content168'] = r['content']  #.encode('utf-8')
        r['content'] = r['content168']
        r['text'] = r['content']
        if 'news_content' in r and r['news_content']:
            r['news_content'] = r['news_content']  #.encode('utf-8')
        else:
            r['news_content'] = ''

        # Filter ads with simple rules
        item = ad_filter(r)
        if item['ad_label'] == 0:
            inputs.append(item)
        else:
            item['clusterid'] = NON_CLUSTER_ID + '_rub'
            items_infos.append(item)
    print('\tAd filter %d data, data list have: %d' %
          (len(inputs), len(items_infos)))

    # Remove rubbish with the SVM classifier
    print('\tSvm rubbish classify...')
    if len(inputs) == 0:
        items = []
    else:
        items = rubbish_classifier(inputs)
    inputs = []
    for item in items:
        if item['rub_label'] == 1:
            item['clusterid'] = NON_CLUSTER_ID + '_rub'
            items_infos.append(item)
        else:
            inputs.append(item)
    print('\tSvm rubbish classify %d data, data list have: %d' %
          (len(inputs), len(items_infos)))

    # Start clustering
    print('\tStart clustering opinion...')
    opinion_name, word_result, text_list, word_main = opinion_main(
        inputs, cluster_num)
    # if len(inputs) >= 500:
    #     opinion_name,word_result,text_list = opinion_main(inputs,10)
    # else:
    #     opinion_name,word_result,text_list = opinion_main(inputs,5)
    print('\tEnd clustering opinion...')

    for k, v in word_result.items():
        #name = opinion_name[k]
        clusters_infos['features'][k] = v
    clusters_infos['word_main'] = word_main

    final_inputs = []
    for k, v in text_list.items():
        for item in v:
            row = copy.deepcopy(item)
            row['clusterid'] = k
            final_inputs.append(row)

    # Deduplicate within each sub-opinion cluster
    cluster_items = dict()
    for r in final_inputs:
        clusterid = r['clusterid']
        try:
            cluster_items[clusterid].append(r)
        except KeyError:
            cluster_items[clusterid] = [r]

    for clusterid, items in cluster_items.items():
        results = duplicate(items)
        items_infos.extend(results)

    return {'cluster_infos': clusters_infos, 'item_infos': items_infos}
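
A minimal calling sketch for comments_rubbish_clustering_calculation, assuming each comment dict carries at least 'content' plus optional 'news_id'/'news_content' as the docstring describes; the comment texts and cluster_num value are illustrative, and a real run would need far more comments than shown.

# Hypothetical input; field values are placeholders (real input would hold many comments).
comments = [
    {'news_id': 'n1', 'content': u'first comment text', 'news_content': u'news body'},
    {'news_id': 'n1', 'content': u'second comment text', 'news_content': u'news body'},
]
result = comments_rubbish_clustering_calculation(comments, cluster_num=5)
cluster_infos = result['cluster_infos']   # per-cluster feature words plus 'word_main'
item_infos = result['item_infos']         # per-item dicts carrying clusterid / weight / duplicate flags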