Example #1
def get_opinions(task_source, task_id, xnr_user_no, opinion_keywords_list,
                 opinion_type, intel_type):
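    """Collect posts matching the opinion keywords from the task_source platform
    (weibo / facebook / twitter), scoped by intel_type ('all', 'follow',
    'influence', or the most sensitive users otherwise), cluster them into
    sub-opinions and save the summary for task_id."""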

    query_item = 'text'
    nest_query_list = []
    tweets_list = []
    if task_source == 'weibo':

        if S_TYPE == 'test':
            current_time = datetime2ts(S_DATE)

        else:
            current_time = int(time.time())

        index_name_list = get_flow_text_index_list(current_time, days=5)
        sort_item = 'retweeted'
        for keyword in opinion_keywords_list:
            nest_query_list.append(
                {'wildcard': {
                    query_item: '*' + keyword + '*'
                }})
        uid_list = []

        SHOULD_PERCENT = 1  # minimum_should_match: at least one keyword must hit

        if intel_type == 'all':
            query_body = {
                'query': {
                    'bool': {
                        'should': nest_query_list,
                        'minimum_should_match': SHOULD_PERCENT
                    }
                },
                'sort': {
                    sort_item: {
                        'order': 'desc'
                    }
                },
                'size': MAX_SEARCH_SIZE
            }

        elif intel_type == 'follow':

            try:
                follow_results = es_xnr.get(index=weibo_xnr_fans_followers_index_name,doc_type=weibo_xnr_fans_followers_index_type,\
                    id=xnr_user_no)['_source']

                # '_source' already holds the stored document, so read the
                # followers list from it directly
                uid_list = follow_results.get('followers', [])
            except:
                uid_list = []

            query_body = {
                'query': {
                    'bool': {
                        'should': nest_query_list,
                        'minimum_should_match': SHOULD_PERCENT,
                        'must': [{
                            'terms': {
                                'uid': uid_list
                            }
                        }]
                    }
                },
                'sort': {
                    sort_item: {
                        'order': 'desc'
                    }
                },
                'size': MAX_SEARCH_SIZE
            }

        elif intel_type == 'influence':
            date = ts2datetime(current_time)

            if S_TYPE == 'test':
                date = S_DATE_BCI

            weibo_bci_index_name = weibo_bci_index_name_pre + date[:4] + date[
                5:7] + date[8:10]

            query_body_bci = {
                'query': {
                    'match_all': {}
                },
                'sort': {
                    'user_index': {
                        'order': 'desc'
                    }
                },
                'size': 500
            }

            weibo_bci_results = es_user_portrait.search(
                index=weibo_bci_index_name,
                doc_type=weibo_bci_index_type,
                body=query_body_bci)['hits']['hits']
            if weibo_bci_results:
                for bci_result in weibo_bci_results:
                    uid = bci_result['_source']['user']
                    uid_list.append(uid)

            query_body = {
                'query': {
                    'bool': {
                        'should': nest_query_list,
                        'minimum_should_match': SHOULD_PERCENT,
                        'must': [{
                            'terms': {
                                'uid': uid_list
                            }
                        }]
                    }
                },
                'sort': {
                    sort_item: {
                        'order': 'desc'
                    }
                },
                'size': MAX_SEARCH_SIZE
            }

        else:

            query_sensitive = {
                'query': {
                    'match_all': {}
                },
                "aggs": {
                    "uids": {
                        "terms": {
                            "field": "uid",
                            "order": {
                                "avg_sensitive": "desc"
                            }
                        },
                        "aggs": {
                            "avg_sensitive": {
                                "avg": {
                                    "field": "sensitive"
                                }
                            }
                        }
                    }
                },
                'size': 500
            }

            es_sensitive_result = es_flow_text.search(index=index_name_list,doc_type='text',\
                    body=query_sensitive)['aggregations']['uids']['buckets']
            for item in es_sensitive_result:
                uid = item['key']
                uid_list.append(uid)

            query_body = {
                'query': {
                    'bool': {
                        'should': nest_query_list,
                        'minimum_should_match': SHOULD_PERCENT,
                        'must': [{
                            'terms': {
                                'uid': uid_list
                            }
                        }]
                    }
                },
                'sort': {
                    sort_item: {
                        'order': 'desc'
                    }
                },
                'size': MAX_SEARCH_SIZE
            }

        # build tweets_list

        tweets_results = es_flow_text.search(index=index_name_list,
                                             doc_type='text',
                                             body=query_body)['hits']['hits']

        if tweets_results:
            for item in tweets_results:
                item = item['_source']
                weibo = item['text']
                tweets_list.append(weibo)

    else:
        if S_TYPE == 'test':
            current_time = datetime2ts(S_DATE_FB)
        else:
            current_time = int(time.time())
        uid_list = []
        sort_item = 'share'
        opinion_keywords_list = [
            word.encode('utf-8') for word in opinion_keywords_list
        ]
        en_keywords_list = trans(opinion_keywords_list, target_language='en')
        for i in range(len(opinion_keywords_list)):
            keyword = opinion_keywords_list[i].decode('utf-8')
            traditional_keyword = simplified2traditional(keyword)

            if len(en_keywords_list) == len(opinion_keywords_list):  # make sure the translation didn't fail
                en_keyword = en_keywords_list[i]
                nest_query_list.append(
                    {'wildcard': {
                        query_item: '*' + en_keyword + '*'
                    }})

            nest_query_list.append(
                {'wildcard': {
                    query_item: '*' + keyword + '*'
                }})
            nest_query_list.append(
                {'wildcard': {
                    query_item: '*' + traditional_keyword + '*'
                }})

        SHOULD_PERCENT = 1  # minimum_should_match: at least one keyword must hit

        if task_source == 'facebook':
            index_name_list = fb_get_flow_text_index_list(current_time, days=5)

            if intel_type == 'all':
                query_body = {
                    'query': {
                        'bool': {
                            'should': nest_query_list,
                            'minimum_should_match': SHOULD_PERCENT
                        }
                    },
                    'sort': {
                        sort_item: {
                            'order': 'desc'
                        }
                    },
                    'size': MAX_SEARCH_SIZE
                }

            elif intel_type == 'follow':

                try:
                    follow_results = es_xnr.get(index=fb_xnr_fans_followers_index_name,doc_type=fb_xnr_fans_followers_index_type,\
                        id=xnr_user_no)['_source']

                    # '_source' already holds the stored document, so read the
                    # fans list from it directly
                    uid_list = follow_results.get('fans_list', [])
                except:
                    uid_list = []

                query_body = {
                    'query': {
                        'bool': {
                            'should': nest_query_list,
                            'minimum_should_match': SHOULD_PERCENT,
                            'must': [{
                                'terms': {
                                    'uid': uid_list
                                }
                            }]
                        }
                    },
                    'sort': {
                        sort_item: {
                            'order': 'desc'
                        }
                    },
                    'size': MAX_SEARCH_SIZE
                }

            elif intel_type == 'influence':
                fb_bci_index_name = fb_bci_index_name_pre + ts2datetime(
                    current_time)
                query_body_bci = {
                    'query': {
                        'match_all': {}
                    },
                    'sort': {
                        'influence': {
                            'order': 'desc'
                        }
                    },
                    'size': 500
                }

                fb_bci_results = es_xnr.search(
                    index=fb_bci_index_name,
                    doc_type=fb_bci_index_type,
                    body=query_body_bci)['hits']['hits']
                #print 'fb_bci_results...',len(fb_bci_results)
                if fb_bci_results:
                    for bci_result in fb_bci_results:
                        uid = bci_result['_source']['uid']
                        uid_list.append(uid)

                query_body = {
                    'query': {
                        'bool': {
                            'should': nest_query_list,
                            'minimum_should_match': SHOULD_PERCENT,
                            'must': [{
                                'terms': {
                                    'uid': uid_list
                                }
                            }]
                        }
                    },
                    'sort': {
                        sort_item: {
                            'order': 'desc'
                        }
                    },
                    'size': MAX_SEARCH_SIZE
                }

            else:

                query_sensitive = {
                    'query': {
                        'match_all': {}
                    },
                    "aggs": {
                        "uids": {
                            "terms": {
                                "field": "uid",
                                "order": {
                                    "avg_sensitive": "desc"
                                }
                            },
                            "aggs": {
                                "avg_sensitive": {
                                    "avg": {
                                        "field": "sensitive"
                                    }
                                }
                            }
                        }
                    },
                    'size': 500
                }

                es_sensitive_result = es_xnr.search(index=index_name_list,doc_type='text',\
                        body=query_sensitive)['aggregations']['uids']['buckets']
                #print 'es_sensitive_result...',len(es_sensitive_result)
                for item in es_sensitive_result:
                    uid = item['key']
                    uid_list.append(uid)

                query_body = {
                    'query': {
                        'bool': {
                            'should': nest_query_list,
                            'minimum_should_match': SHOULD_PERCENT,
                            'must': [{
                                'terms': {
                                    'uid': uid_list
                                }
                            }]
                        }
                    },
                    'sort': {
                        sort_item: {
                            'order': 'desc'
                        }
                    },
                    'size': MAX_SEARCH_SIZE
                }

            #print 'query_body...',query_body
            tweets_results = es_xnr.search(index=index_name_list,
                                           doc_type='text',
                                           body=query_body)['hits']['hits']

            if tweets_results:
                for item in tweets_results:
                    item = item['_source']
                    weibo = item['text']
                    tweets_list.append(weibo)

        else:
            index_name_list = tw_get_flow_text_index_list(current_time, days=5)

            if intel_type == 'all':
                query_body = {
                    'query': {
                        'bool': {
                            'should': nest_query_list,
                            'minimum_should_match': SHOULD_PERCENT
                        }
                    },
                    'sort': {
                        sort_item: {
                            'order': 'desc'
                        }
                    },
                    'size': MAX_SEARCH_SIZE
                }

            elif intel_type == 'follow':

                try:
                    follow_results = es_xnr.get(index=tw_xnr_fans_followers_index_name,doc_type=tw_xnr_fans_followers_index_type,\
                        id=xnr_user_no)['_source']

                    # '_source' already holds the stored document, so read the
                    # followers list from it directly
                    uid_list = follow_results.get('followers_list', [])
                except:
                    uid_list = []

                query_body = {
                    'query': {
                        'bool': {
                            'should': nest_query_list,
                            'minimum_should_match': SHOULD_PERCENT,
                            'must': [{
                                'terms': {
                                    'uid': uid_list
                                }
                            }]
                        }
                    },
                    'sort': {
                        sort_item: {
                            'order': 'desc'
                        }
                    },
                    'size': MAX_SEARCH_SIZE
                }

            elif intel_type == 'influence':
                tw_bci_index_name = tw_bci_index_name_pre + ts2datetime(
                    current_time)
                query_body_bci = {
                    'query': {
                        'match_all': {}
                    },
                    'sort': {
                        'influence': {
                            'order': 'desc'
                        }
                    },
                    'size': 500
                }

                tw_bci_results = es_xnr.search(
                    index=tw_bci_index_name,
                    doc_type=tw_bci_index_type,
                    body=query_body_bci)['hits']['hits']
                if tw_bci_results:
                    for bci_result in tw_bci_results:
                        uid = bci_result['_source']['uid']
                        uid_list.append(uid)

                query_body = {
                    'query': {
                        'bool': {
                            'should': nest_query_list,
                            'minimum_should_match': SHOULD_PERCENT,
                            'must': [{
                                'terms': {
                                    'uid': uid_list
                                }
                            }]
                        }
                    },
                    'sort': {
                        sort_item: {
                            'order': 'desc'
                        }
                    },
                    'size': MAX_SEARCH_SIZE
                }

            else:

                query_sensitive = {
                    'query': {
                        'match_all': {}
                    },
                    "aggs": {
                        "uids": {
                            "terms": {
                                "field": "uid",
                                "order": {
                                    "avg_sensitive": "desc"
                                }
                            },
                            "aggs": {
                                "avg_sensitive": {
                                    "avg": {
                                        "field": "sensitive"
                                    }
                                }
                            }
                        }
                    },
                    'size': 500
                }

                es_sensitive_result = es_xnr.search(index=index_name_list,doc_type='text',\
                        body=query_sensitive)['aggregations']['uids']['buckets']
                for item in es_sensitive_result:
                    uid = item['key']
                    uid_list.append(uid)

                query_body = {
                    'query': {
                        'bool': {
                            'should': nest_query_list,
                            'minimum_should_match': SHOULD_PERCENT,
                            'must': [{
                                'terms': {
                                    'uid': uid_list
                                }
                            }]
                        }
                    },
                    'sort': {
                        sort_item: {
                            'order': 'desc'
                        }
                    },
                    'size': MAX_SEARCH_SIZE
                }

            print 'index_name_list...', index_name_list
            print 'query_body........', query_body
            tweets_results = es_xnr.search(index=index_name_list,
                                           doc_type='text',
                                           body=query_body)['hits']['hits']

            if tweets_results:
                for item in tweets_results:
                    item = item['_source']
                    weibo = item['text']
                    tweets_list.append(weibo)

    if tweets_list:
        opinion_name, word_result, text_list = opinion_main(tweets_list,
                                                            k_cluster=5)
        sub_opinion_results = dict()

        topic_keywords_list = []
        summary_text_list = []

        for topic, text in text_list.iteritems():

            topic_name = opinion_name[topic]
            sub_opinion_results[topic_name] = text[:SUB_OPINION_WEIBO_LIMIT]

            topic_keywords_list.extend(topic_name.split('&'))
            summary_text_list.extend(text)

        #try:
        print 'summary_text_list..', len(summary_text_list)
        print 'topic_keywords_list..', topic_keywords_list
        summary = text_generation_main(summary_text_list, topic_keywords_list)
        #summary = summary_main(summary_text_list)
        #except:
        #    summary = ''

    else:
        sub_opinion_results = {}
        summary = ''

    print 'start saving sub-opinion results......'
    print 'summary....', summary
    mark = save_intelligent_opinion_results(task_id, sub_opinion_results,
                                            summary, intel_type)

    return mark
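
A minimal call sketch (hypothetical ids and keywords; assumes the module-level ES clients and constants such as MAX_SEARCH_SIZE are already configured):

mark = get_opinions(task_source='weibo',
                    task_id='task_0001',              # hypothetical task id
                    xnr_user_no='WXNR0004',           # hypothetical virtual-user id
                    opinion_keywords_list=[u'keyword_a', u'keyword_b'],
                    opinion_type='',                  # not used inside the function body
                    intel_type='all')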
Example #2
def read_tracing_followers_tweet():
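    """Scan the weibo fans/followers index; for each virtual user, fetch today's
    posts from its traced users and queue every post not seen before as a timed
    retweet task with a randomized scheduled timestamp."""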

    if S_TYPE == 'test':
        query_body = {
            'query': {
                'term': {
                    'xnr_user_no': 'WXNR0004'
                }
            },
            'size': MAX_SEARCH_SIZE
        }

    else:
        query_body = {'query': {'match_all': {}}, 'size': MAX_SEARCH_SIZE}


    results = es_xnr.search(index=weibo_xnr_fans_followers_index_name,doc_type=weibo_xnr_fans_followers_index_type,\
                body=query_body)['hits']['hits']
    if results:

        for result in results:
            result = result['_source']
            #print 'result..',result
            try:
                xnr_user_no = result['xnr_user_no']
            except:
                xnr_user_no = result['xnr_use_no']
            if not xnr_user_no:
                continue
            print 'result...', result
            trace_follow_list = result['trace_follow_list']

            if S_TYPE == 'test':
                current_time = datetime2ts(S_DATE)
                #trace_follow_list = TRACE_FOLLOW_LIST
            else:
                current_time = int(time.time())

            current_date = ts2datetime(current_time)

            flow_text_index_name = flow_text_index_name_pre + current_date

            query_body_flow = {
                'query': {
                    'filtered': {
                        'filter': {
                            'terms': {
                                'uid': trace_follow_list
                            }
                        }
                    }
                },
                'size': MAX_SEARCH_SIZE
            }

            results_flow = es_flow_text.search(index=flow_text_index_name,doc_type=flow_text_index_type,\
                            body=query_body_flow)['hits']['hits']

            if results_flow:
                for result_flow in results_flow:

                    result_flow = result_flow['_source']
                    mid = result_flow['mid']

                    # first check whether this mid has already been stored

                    task_id = xnr_user_no + '_' + mid
                    try:
                        # skip if this mid was already added
                        es_xnr.get(index=weibo_xnr_retweet_timing_list_index_name,doc_type=\
                            weibo_xnr_retweet_timing_list_index_type,id=task_id)['_source']
                        continue

                    except:
                        # if not added yet, add it to the list
                        task_detail = {}
                        task_detail['xnr_user_no'] = xnr_user_no
                        task_detail['mid'] = mid
                        task_detail['text'] = result_flow['text']
                        task_detail['uid'] = result_flow['uid']
                        task_detail['nick_name'], task_detail[
                            'photo_url'] = uid2nick_name_photo(
                                result_flow['uid'])
                        task_detail['timestamp'] = result_flow['timestamp']
                        task_detail['timestamp_set'] = result_flow[
                            'timestamp'] + random.randint(
                                RETWEET_START_TS, RETWEET_END_TS)
                        task_detail['compute_status'] = 0

                        es_xnr.index(index=weibo_xnr_retweet_timing_list_index_name,doc_type=\
                            weibo_xnr_retweet_timing_list_index_type,body=task_detail,id=task_id)
Example #3
def read_flow_text(flow_text_index_name, current_date):
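    """Page through non-sensitive posts in flow_text_index_name, classify each
    text, keep up to 20 posts per label and store the per-label collections in
    the daily interest index for current_date."""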

    #flow_text_index_name = facebook_flow_text_index_name_pre + current_date

    i = 0

    label_count_dict = {}
    content_dict = {}

    print '!!!'

    while True:

        query_body = {
            'query': {
                'bool': {
                    'must': [{
                        'term': {
                            'sensitive': 0
                        }
                    }]
                }
            },
            'size': 1000,
            'from': i * 1000
        }

        # original posts with sensitive == 0
        #print '222'
        search_results = es_xnr.search(index=flow_text_index_name,doc_type=facebook_flow_text_index_type,\
          body=query_body)['hits']['hits']

        weibo_list = []

        for result in search_results:
            result = result['_source']
            weibo_list.append(result['text'].encode('utf-8'))

        label_list = triple_classifier_new(weibo_list)

        label_count = Counter(label_list)
        #print '333'
        for j in range(len(search_results)):

            label = label_list[j]

            try:
                if label_count_dict[label] < 20:
                    content_dict[label].append(search_results[j]['_source'])
                    label_count_dict[label] += 1

            except:
                content_dict[label] = [search_results[j]['_source']]

                label_count_dict[label] = 1

        i += 1

        if i % 1000 == 0:
            print 'i...', i
            print 'label_count_dict...', label_count_dict

        # loop termination condition: stop once every label has at least 20 posts
        min_label = min(label_count_dict, key=label_count_dict.get)
        if label_count_dict[min_label] >= 20:
            break
    print 'label_count_dict::', label_count_dict

    for content_label, content_weibo in content_dict.iteritems():
        _id = content_label
        index_name = fb_daily_interest_index_name_pre + '_' + current_date
        fb_daily_inerests_flow_text_mappings(index_name)
        item_dict = {}
        item_dict['timestamp'] = datetime2ts(current_date)
        item_dict['content'] = json.dumps(content_weibo)
        print es_xnr.index(index=index_name,
                           doc_type=fb_daily_interest_index_type,
                           id=_id,
                           body=item_dict)

        print content_label, '====', len(content_weibo)
Example #4
def get_trace_follow_operate(xnr_user_no, uid_string, nick_name_string):
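    """Resolve uids from a comma-separated uid string (or from nick names looked
    up in the facebook user index), merge them into the virtual user's
    trace_follow_list and fans_list, and return [mark, fail_nick_name_list]."""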

    mark = False
    fail_nick_name_list = []
    uid_list = []  # ensure uid_list is defined even if both inputs are empty
    if uid_string:
        uid_list = uid_string.encode('utf-8').split(',')

    elif nick_name_string:
        nick_name_list = nick_name_string.encode('utf-8').split(',')
        uid_list = []

        for nick_name in nick_name_list:
            query_body = {
                'query': {
                    'filtered': {
                        'filter': {
                            'term': {
                                'nick_name': nick_name
                            }
                        }
                    }
                },
                '_source': ['uid']
            }
            try:
                uid_results = es.search(index=facebook_user_index_name,doc_type=facebook_user_index_type,\
                            body=query_body)['hits']['hits']

                uid_result = uid_results[0]['_source']
                uid = uid_result['uid']
                uid_list.append(uid)

            except:
                fail_nick_name_list.append(nick_name)

    try:
        result = es.get(index=fb_xnr_fans_followers_index_name,doc_type=fb_xnr_fans_followers_index_type,\
                        id=xnr_user_no)['_source']

        try:
            trace_follow_list = result['trace_follow_list']
        except:
            trace_follow_list = []

        try:
            followers_list = result['fans_list']
        except:
            followers_list = []

        trace_follow_list = list(set(trace_follow_list) | set(uid_list))

        followers_list = list(set(followers_list) | set(uid_list))

        es.update(index=fb_xnr_fans_followers_index_name,doc_type=fb_xnr_fans_followers_index_type,\
                    id=xnr_user_no,body={'doc':{'trace_follow_list':trace_follow_list,'fans_list':followers_list}})

        mark = True

    except:

        item_exists = {}

        item_exists['xnr_user_no'] = xnr_user_no
        item_exists['trace_follow_list'] = uid_list
        item_exists['fans_list'] = uid_list

        es.index(index=fb_xnr_fans_followers_index_name,doc_type=fb_xnr_fans_followers_index_type,\
                    id=xnr_user_no,body=item_exists)

        mark = True

    return [mark, fail_nick_name_list]
Example #5
def get_hot_sensitive_recommend_at_user(sort_item):
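    """Rank yesterday's facebook posts by sort_item, collect their authors and
    return a dict mapping uid to nick_name for the top users to @-recommend."""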

    if S_TYPE == 'test':
        now_ts = datetime2ts(S_DATE_FB)
    else:
        now_ts = int(time.time())
    datetime = ts2datetime(now_ts - 24 * 3600)

    #sort_item = 'sensitive'
    sort_item_2 = 'timestamp'
    index_name = facebook_flow_text_index_name_pre + datetime

    query_body = {
        'query': {
            'match_all': {}
        },
        'sort': {
            sort_item: {
                'order': 'desc'
            }
        },
        'size': HOT_EVENT_TOP_USER,
        '_source': ['uid', 'user_fansnum', 'retweeted', 'timestamp']
    }

    # if sort_item == 'retweeted':
    #     sort_item_2 = 'timestamp'
    # else:
    #     sort_item_2 = 'retweeted'

    es_results = es.search(index=index_name,
                           doc_type=facebook_flow_text_index_type,
                           body=query_body)['hits']['hits']

    uid_fansnum_dict = dict()
    if es_results:
        for result in es_results:
            result = result['_source']
            uid = result['uid']
            uid_fansnum_dict[uid] = {}
            uid_fansnum_dict[uid][sort_item_2] = result[sort_item_2]

    uid_fansnum_dict_sort_top = sorted(uid_fansnum_dict.items(),
                                       key=lambda x: x[1][sort_item_2],
                                       reverse=True)

    uid_set = set()

    for item in uid_fansnum_dict_sort_top:
        uid_set.add(item[0])

    uid_list = list(uid_set)

    ## look up nick_name in the facebook user index by uid
    uid_nick_name_dict = dict()  # uid never changes, but nick_name may
    es_results_user = es.mget(index=facebook_user_index_name,
                              doc_type=facebook_user_index_type,
                              body={'ids': uid_list})['docs']
    i = 0
    for result in es_results_user:
        if result['found'] == True:
            result = result['_source']
            uid = result['uid']
            nick_name = result['name']
            if nick_name:
                i += 1
                uid_nick_name_dict[uid] = nick_name
        if i >= HOT_AT_RECOMMEND_USER_TOP:
            break

    return uid_nick_name_dict
Example #6
     d = r.get(self.wxbot_id)
     if d:
         try:
             # parse the stored bot record once instead of eval-ing it per field
             bot_info = eval(d)
             wx_id = bot_info['wx_id']
             wxbot_port = bot_info['wxbot_port']
             submitter = bot_info['submitter']
             mail = bot_info['mail']
             access_id = bot_info['access_id']
             remark = bot_info['remark']
             break
         except Exception, e:
             print e
 #check if already exist
 query_body_wx_exist = {'query': {'term': {'wx_id': wx_id}}}
 search_result = es_xnr.search(index=wx_xnr_index_name,
                               doc_type=wx_xnr_index_type,
                               body=query_body_wx_exist)['hits']['hits']
 if search_result:
     #update the xnr info and save it to es
     pass
 else:
     #print 'save_bot_info'
     wxxnr_data = {
         'wx_id': wx_id,
         'puid': self.self.puid,
         'user_no': wxbot_id2user_no(self.wxbot_id),
         'xnr_user_no': self.wxbot_id,
         'wxbot_port': wxbot_port,
         'create_ts': int(time.time()),
         'nickname': self.self.name,
         'remark': remark,
Example #7
def create_date_warning(today_datetime):
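    """For every stored date reminder falling within WARMING_DAY days of
    today_datetime, look up matching twitter warning content and index it into
    the per-date timing-warning index; return the list of write marks."""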
    query_body = {
        'query': {
            'match_all': {}
        },
        'size': MAX_VALUE,
        'sort': {
            'date_time': {
                'order': 'asc'
            }
        }
    }
    try:
        result = es_xnr.search(index=weibo_date_remind_index_name,
                               doc_type=weibo_date_remind_index_type,
                               body=query_body)['hits']['hits']
        date_result = []
        for item in result:
            #compute how far the reminder date is from today
            date_time = item['_source']['date_time']
            year = ts2yeartime(today_datetime)
            warming_date = year + '-' + date_time
            today_date = ts2datetime(today_datetime)
            countdown_num = (datetime2ts(warming_date) -
                             datetime2ts(today_date)) / DAY

            if abs(countdown_num) < WARMING_DAY:
                #look up warning tweets matching the given keywords
                print 'date_time:', date_time
                keywords = item['_source']['keywords']
                date_warming = lookup_twitter_date_warming(
                    keywords, today_datetime)
                item['_source']['twitter_date_warming_content'] = json.dumps(
                    date_warming)
                item['_source']['validity'] = 0
                item['_source']['timestamp'] = today_datetime

                task_id = str(
                    item['_source']['create_time']) + '_' + str(today_datetime)
                #print 'task_id',task_id
                #print 'date_warming',date_warming
                #write to the database

                twitter_timing_warning_index_name = twitter_timing_warning_index_name_pre + warming_date

                if date_warming:
                    print twitter_timing_warning_index_name
                    try:

                        es_xnr_2.index(
                            index=twitter_timing_warning_index_name,
                            doc_type=twitter_timing_warning_index_name,
                            body=item['_source'],
                            id=task_id)
                        mark = True
                    except:
                        mark = False
                else:
                    mark = False

                date_result.append(mark)
        else:
            pass

    except:
        date_result = []
    return date_result
Example #8
def xnr_keywords_compute(xnr_user_no):
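    """Aggregate keywords over today's tweets posted by the virtual user's
    concerned users and return a dict of refined keyword counts."""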
    #look up the concerned-user (friend) list
    followers_list = lookup_xnr_concernedusers(xnr_user_no)
    lookup_condition_list = []
    print 'xnr_user_no, followers_list:', xnr_user_no, followers_list
    lookup_condition_list.append({
        'filtered': {
            'filter': {
                'bool': {
                    'must': {
                        'terms': {
                            'uid': followers_list
                        }
                    }
                }
            }
        }
    })

    #pick the flow-text index to query by date
    if S_TYPE == 'test':
        date_time = test_date
    else:
        now_time = int(time.time())
        date_time = ts2datetime(now_time)
    flow_text_index_name = twitter_flow_text_index_name_pre + date_time

    #aggregate keywords by date
    # print lookup_condition_list
    for item_condition in lookup_condition_list:
        query_body = {
            'query': item_condition,
            'aggs': {
                'keywords': {
                    'terms': {
                        'field': 'keywords_string',
                        'size': 1000
                    }
                }
            }
        }

        flow_text_exist=es_xnr.search(index=flow_text_index_name,doc_type=twitter_flow_text_index_type,\
               body=query_body)['aggregations']['keywords']['buckets']

        # print 'flow_text_exist:',flow_text_exist
        word_dict = dict()

        word_dict_new = dict()

        keywords_string = ''
        for item in flow_text_exist:
            word = item['key']
            count = item['doc_count']
            word_dict[word] = count

            keywords_string += '&'
            keywords_string += item['key']

        k_dict = extract_keywords(keywords_string)

        for item_item in k_dict:
            keyword = item_item.word
            # print 'keyword::',type(keyword)
            word_dict_new[keyword] = word_dict[keyword]

    return word_dict_new
Example #9
def query_mid_list(ts, social_sensors, time_segment, message_type=1):
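    """Return the mids of posts published by the social sensors within
    [ts - time_segment, ts), searched over the last two days' flow-text indices."""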
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [{
                            "range": {
                                "timestamp": {
                                    "gte": ts - time_segment,
                                    "lt": ts
                                }
                            }
                        }, {
                            "terms": {
                                "uid": social_sensors
                            }
                        }]
                    }
                }
            }
        },
        "sort": {
            "sentiment": {
                "order": "desc"
            }
        },
        "size": 10000
    }

    mid_dict = dict()

    datetime_1 = ts2datetime(ts)
    datetime_2 = ts2datetime(ts - 24 * 3600)
    index_name_1 = flow_text_index_name_pre + datetime_1
    index_name_2 = flow_text_index_name_pre + datetime_2
    index_list = []
    exist_es_1 = es_text.indices.exists(index_name_1)
    exist_es_2 = es_text.indices.exists(index_name_2)
    if exist_es_1:
        index_list.append(index_name_1)
    if exist_es_2:
        index_list.append(index_name_2)
    if index_list:
        search_results = es_text.search(index=index_list,
                                        doc_type=flow_text_index_type,
                                        body=query_body)["hits"]["hits"]
    else:
        search_results = []
    origin_mid_list = set()
    if search_results:
        for item in search_results:
            origin_mid_list.add(item["_id"])
            # if message_type == 1:
            #     origin_mid_list.add(item["_id"])
            # else:
            #     origin_mid_list.add(item['_source']['root_mid'])
            #     mid_dict[item['_source']['root_mid']] = item["_id"] # 源头微博和当前转发微博的mid

    # if message_type != 1:
    # # 保证获取的源头微博能在最近两天内找到
    #     filter_list = []
    #     filter_mid_dict = dict()
    #     for iter_index in index_list:
    #         exist_es = es_text.mget(index=iter_index, doc_type="text", body={"ids":list(origin_mid_list)})["docs"]
    #         print 'es_text...',es_text
    #         print 'index_list..',index_list
    #         for item in exist_es:
    #             if item["found"]:
    #                 filter_list.append(item["_id"])
    #                 filter_mid_dict[item["_id"]] = mid_dict[item["_id"]]
    #     origin_mid_list = filter_list
    #     mid_dict = filter_mid_dict
    return list(origin_mid_list), mid_dict
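
A sketch of a possible call (the sensor uids below are placeholders; the module's es_text client and index constants are assumed):

import time
sensor_uids = ['1234567890', '2345678901']  # placeholder uids of social sensors
mid_list, mid_dict = query_mid_list(int(time.time()), sensor_uids, 3600)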
Example #10
def social_sensing():
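    """Fetch the texts of the hot tids reported by count_statis() from the last
    seven days of flow-text indices, deduplicate and topic-classify them, then
    bulk-index the enriched documents into the social-sensing index."""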

    all_tid_list, end_ts = count_statis()

    if S_TYPE == 'test':
        all_tid_list = ALL_TID_LIST

    index_list = []
    for i in range(7):
        timestamp = end_ts - i * DAY
        flow_text_index_name = flow_text_index_name_pre + ts2datetime(
            timestamp)
        index_list.append(flow_text_index_name)
    #index_list = [flow_text_index_name_pre+date_1,flow_text_index_name_pre+date_2]
    print 'index_list...', index_list
    # the perceived events: all_tid_list
    sensitive_text_list = []
    tmp_sensitive_warning = ""
    text_dict = dict()  # text info
    tid_value = dict()  # topic value per text
    duplicate_dict = dict()  # duplicate mapping
    portrait_dict = dict()  # background (portrait) info
    classify_text_dict = dict()  # texts to classify
    classify_uid_list = []
    classify_tid_list = []
    duplicate_text_list = []
    sensitive_words_dict = dict()
    sensitive_weibo_detail = {}
    all_text_dict = dict()
    tid_ts_dict = dict()  # text publish timestamp

    # start once events occur
    #if 1:

    if index_list and all_tid_list:
        query_body = {
            "query": {
                "filtered": {
                    "filter": {
                        "terms": {
                            "tid": all_tid_list
                        }
                    }
                }
            },
            "size": 5000
        }
        search_results = es.search(index=index_list,
                                   doc_type="text",
                                   body=query_body)['hits']['hits']
        print "search tid len: ", len(search_results)

        if search_results:
            for item in search_results:
                iter_uid = item['_source']['uid']
                iter_tid = item['_source']['tid']
                tid_ts_dict[iter_tid] = item["_source"]["timestamp"]
                iter_text = item['_source']['text'].encode('utf-8', 'ignore')
                iter_sensitive = item['_source'].get('sensitive', 0)
                tmp_text = get_weibo(item['_source'])
                all_text_dict[iter_tid] = tmp_text

                duplicate_text_list.append({
                    "_id": iter_tid,
                    "title": "",
                    "content": iter_text.decode("utf-8", 'ignore')
                })

                if iter_sensitive:
                    tmp_sensitive_warning = signal_sensitive_variation  # posts containing sensitive words
                    sensitive_words_dict[iter_tid] = iter_sensitive

                keywords_dict = json.loads(item['_source']['keywords_dict'])
                personal_keywords_dict = dict()
                for k, v in keywords_dict.iteritems():
                    k = k.encode('utf-8', 'ignore')
                    personal_keywords_dict[k] = v
                classify_text_dict[iter_tid] = personal_keywords_dict
                #classify_uid_list.append(iter_uid)
                classify_tid_list.append(iter_tid)

            # deduplicate
            print "start duplicate"
            if duplicate_text_list:
                dup_results = duplicate(duplicate_text_list)
                for item in dup_results:
                    if item['duplicate']:
                        duplicate_dict[item['_id']] = item['same_from']

            # classify
            print "start classify"
            tid_value = dict()
            if classify_text_dict:
                #classify_results = topic_classfiy(classify_uid_list, classify_text_dict)
                classify_results = topic_classfiy(classify_tid_list,
                                                  classify_text_dict)

                #print "classify_results: ", classify_results

                for k, v in classify_results.iteritems():  # tid:value
                    #tid_value[k] = topic_value_dict[v[0]]
                    tid_value[k] = v[0]

    # organize data

    tid_list = all_text_dict.keys()
    print "final tid:", len(tid_list)
    print "intersection: ", len(set(tid_list) & set(all_tid_list))

    bulk_action = []
    count = 0

    #social_sensing_index_name = "tw_social_sensing_text_" + ts2datetime(end_ts)
    social_sensing_index_name = "tw_social_sensing_text"
    mappings_social_sensing_text(social_sensing_index_name)

    for tid in tid_list:
        iter_dict = dict()

        if duplicate_dict.has_key(tid):
            iter_dict["duplicate"] = duplicate_dict[tid]
        else:
            iter_dict["duplicate"] = ""

        iter_dict["compute_status"] = 0  # not yet computed
        iter_dict["topic_field"] = tid_value[tid]
        iter_dict["detect_ts"] = end_ts
        #iter_dict["xnr_user_no"] = xnr_user_no

        iter_dict.update(all_text_dict[tid])
        count += 1
        print 'iter_dict:::', iter_dict
        # _id = xnr_user_no + '_' + tid
        bulk_action.extend([{"index": {"_id": tid}}, iter_dict])
        if count % 500 == 0:
            es.bulk(bulk_action,
                    index=social_sensing_index_name,
                    doc_type="text",
                    timeout=600)
            bulk_action = []

    if bulk_action:
        es.bulk(bulk_action,
                index=social_sensing_index_name,
                doc_type="text",
                timeout=600)

    return "1"
Example #11
def count_statis():
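    """Aggregate share statistics per tid over the last 12 hours and return the
    tids whose share count grew the most, together with the end timestamp."""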

    end_ts = int(time.time())

    if S_TYPE == 'test':
        end_ts = datetime2ts(S_DATE_FB)

    start_ts = end_ts - 12 * 3600

    query_body = {
        'query': {
            'bool': {
                'must': [{
                    'range': {
                        'update_time': {
                            'gt': start_ts,
                            'lte': end_ts
                        }
                    }
                }]
            }
        },
        'aggs': {
            'all_tids': {
                'terms': {
                    'field': 'tid',
                    'order': {
                        'stats_share.max': 'desc'
                    },
                    'size': MAX_SIZE
                },
                'aggs': {
                    'stats_share': {
                        'stats': {
                            'field': 'share'
                        }
                    }
                }
            }
        }
    }

    twitter_count_index_name_1 = twitter_count_index_name_pre + ts2datetime(
        end_ts)
    twitter_count_index_name_2 = twitter_count_index_name_pre + ts2datetime(
        end_ts - DAY)
    twitter_count_index_name_list = [
        twitter_count_index_name_1, twitter_count_index_name_2
    ]

    print 'twitter_count_index_name_list...', twitter_count_index_name_list

    results = es.search(index=twitter_count_index_name_list ,doc_type='text',\
        body=query_body)['aggregations']['all_tids']['buckets']

    results_origin = copy.deepcopy(results)

    print 'start count aggs sort...'

    results.sort(key=lambda x:
                 (x['stats_share']['max'] - x['stats_share']['min']),
                 reverse=True)

    tid_list = [
        item['key'] for item in results
        if (item['stats_share']['max'] -
            item['stats_share']['min']) >= HOT_LOWWER
    ]

    if len(tid_list) < TOP_HOT_FB:

        tid_list_2 = [
            item['key'] for item in results_origin[:TOP_HOT_FB - len(tid_list)]
        ]

        tid_list.extend(tid_list_2)

    print 'all tid_list over...'
    print 'len..tid_list...', tid_list

    return tid_list, end_ts
Example #12
def create_event_warning(xnr_user_no, today_datetime, write_mark):
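    """For each hashtag event, collect its sensitive posts, enrich them with
    comment/share/favorite counts and influence scores (friends weighted higher),
    summarize the main participating users, and either write the warning to ES
    (write_mark is truthy) or return the warning contents."""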
    #get the event names (hashtags)
    hashtag_list = get_hashtag(today_datetime)
    #print 'hashtag_list/:',hashtag_list

    facebook_flow_text_index_name = get_timets_set_indexset_list(
        facebook_flow_text_index_name_pre, today_datetime, today_datetime)

    #the virtual user's friend list
    friends_list = lookup_xnr_friends(xnr_user_no)

    event_warming_list = []
    for event_item in hashtag_list:
        event_warming_content = dict()  #event name, main participants, representative posts, event influence, average event time
        event_warming_content['event_name'] = event_item['event_name']
        event_influence_sum = 0
        event_time_sum = 0
        query_body = {
            'query': {
                'filtered': {
                    'filter': {
                        'bool': {
                            'must': [{
                                'term': {
                                    'hashtag': event_item['event_name']
                                }
                            }, {
                                'range': {
                                    'sensitive': {
                                        'gte': 1
                                    }
                                }
                            }]
                        }
                    }
                }
            },
            'size': MAX_WARMING_SIZE,
            'sort': {
                'sensitive': {
                    'order': 'desc'
                }
            }
        }
        event_results = es_xnr.search(index=facebook_flow_text_index_name,
                                      doc_type=facebook_flow_text_index_type,
                                      body=query_body)['hits']['hits']
        if event_results:
            facebook_result = []
            friends_num_dict = dict()
            alluser_num_dict = dict()
            #print 'sencond_time:::',int(time.time())
            for item in event_results:
                #look up the three metric fields
                fid_result = lookup_fid_attend_index(item['_source']['fid'],
                                                     today_datetime)
                if fid_result:
                    item['_source']['comment'] = fid_result['comment']
                    item['_source']['share'] = fid_result['share']
                    item['_source']['favorite'] = fid_result['favorite']
                else:
                    item['_source']['comment'] = 0
                    item['_source']['share'] = 0
                    item['_source']['favorite'] = 0
                #print 'event_content:',item['_source']['text']
                #count user participation
                if alluser_num_dict.has_key(str(item['_source']['uid'])):
                    friends_mark = set_intersection(item['_source']['uid'],
                                                    friends_list)
                    if friends_mark > 0:
                        alluser_num_dict[str(
                            item['_source']['uid'])] = alluser_num_dict[str(
                                item['_source']['uid'])] + 1 * 2
                    else:
                        alluser_num_dict[str(
                            item['_source']['uid'])] = alluser_num_dict[str(
                                item['_source']['uid'])] + 1
                else:
                    alluser_num_dict[str(item['_source']['uid'])] = 1

                #compute influence
                origin_influence_value = (1 + item['_source']['comment'] +
                                          item['_source']['share'] +
                                          item['_source']['favorite']) * (
                                              1 + item['_source']['sensitive'])
                friends_value = judge_user_type(item['_source']['uid'],
                                                friends_list)
                item['_source'][
                    'facebook_influence_value'] = origin_influence_value * friends_value

                #look up the user's nickname
                item['_source']['nick_name'] = get_user_nickname(
                    item['_source']['uid'])
                facebook_result.append(item['_source'])

                #accumulate influence and time
                event_influence_sum = event_influence_sum + item['_source'][
                    'facebook_influence_value']
                event_time_sum = event_time_sum + item['_source']['timestamp']

            # print 'third_time:::',int(time.time())
            #representative info
            facebook_result.sort(key=lambda k:
                                 (k.get('facebook_influence_value', 0)),
                                 reverse=True)
            event_warming_content['main_facebook_info'] = json.dumps(
                facebook_result)

            #event influence and event time
            number = len(event_results)
            event_warming_content[
                'event_influence'] = event_influence_sum / number
            event_warming_content['event_time'] = event_time_sum / number

            #sort the users
            alluser_num_dict = sorted(alluser_num_dict.items(),
                                      key=lambda d: d[1],
                                      reverse=True)
            main_userid_list = []
            for i in xrange(0, len(alluser_num_dict)):
                main_userid_list.append(alluser_num_dict[i][0])

            #main participant info
            main_user_info = []
            user_es_result = es_xnr.mget(index=facebook_user_index_name,
                                         doc_type=facebook_user_index_type,
                                         body={'ids':
                                               main_userid_list})['docs']
            # print 'user_es_result:',user_es_result
            for item in user_es_result:

                user_dict = dict()
                if item['found']:
                    user_dict['uid'] = item['_id']
                    user_dict['username'] = item['_source']['username']
                    if item['_source'].has_key('talking_about_count'):
                        user_dict['talking_about_count'] = item['_source'][
                            'talking_about_count']
                    else:
                        user_dict['talking_about_count'] = 0
                    if item['_source'].has_key('likes'):
                        user_dict['likes'] = item['_source']['likes']
                    else:
                        user_dict['likes'] = 0
                    if item['_source'].has_key('category'):
                        user_dict['category'] = item['_source']['category']
                    else:
                        user_dict['category'] = ''
                else:
                    # user_dict['icon']=''
                    user_dict['uid'] = item['_id']
                    user_dict['username'] = ''
                    user_dict['talking_about_count'] = 0
                    user_dict['likes'] = 0
                    user_dict['category'] = ''
                main_user_info.append(user_dict)
            event_warming_content['main_user_info'] = json.dumps(
                main_user_info)

            # print 'fourth_time:::',int(time.time())
            event_warming_content['xnr_user_no'] = xnr_user_no
            event_warming_content['validity'] = 0
            event_warming_content['timestamp'] = today_datetime
            now_time = int(time.time())
            # task_id=xnr_user_no+'_'+str(now_time)
            task_id = xnr_user_no + '_' + event_warming_content['event_name']

            #write to the database
            if write_mark:
                # print 'today_datetime:::',ts2datetime(today_datetime)
                print 'task_id_event:', task_id
                mark = write_envent_warming(today_datetime,
                                            event_warming_content, task_id)
                event_warming_list.append(mark)
            else:
                event_warming_list.append(event_warming_content)

        else:
            pass
        # print 'fifth_time:::',int(time.time())
    return event_warming_list
Example #13
def create_personal_warning(xnr_user_no, today_datetime):
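    """Rank users by summed sensitivity in today's facebook posts, collect each
    top user's sensitive posts and index a per-user warning document; return the
    list of write marks."""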
    #look up the friend list
    friends_list = lookup_xnr_friends(xnr_user_no)

    #look up the virtual user's uid
    xnr_uid = lookup_xnr_uid(xnr_user_no)

    #find the users ranked highest by sensitivity
    query_body = {
        # 'query':{
        #     'filtered':{
        #         'filter':{
        #             'terms':{'uid':friends_list}
        #         }
        #     }
        # },
        'aggs': {
            'friends_sensitive_num': {
                'terms': {
                    'field': 'uid'
                },
                'aggs': {
                    'sensitive_num': {
                        'sum': {
                            'field': 'sensitive'
                        }
                    }
                }
            }
        },
        'size': MAX_SEARCH_SIZE
    }

    facebook_flow_text_index_name = get_timets_set_indexset_list(
        facebook_flow_text_index_name_pre, today_datetime, today_datetime)

    try:
        first_sum_result=es_xnr.search(index=facebook_flow_text_index_name,doc_type=facebook_flow_text_index_type,\
        body=query_body)['aggregations']['friends_sensitive_num']['buckets']
    except:
        first_sum_result = []

    #print 'first_sum_result',first_sum_result
    top_userlist = []
    for i in xrange(0, len(first_sum_result)):
        user_sensitive = first_sum_result[i]['sensitive_num']['value']
        if user_sensitive > 0:
            user_dict = dict()
            user_dict['uid'] = first_sum_result[i]['key']
            friends_mark = judge_user_type(user_dict['uid'], friends_list)
            user_dict['sensitive'] = user_sensitive * friends_mark
            top_userlist.append(user_dict)
        else:
            pass
    #####################
    #if the user is a friend, the computed sensitivity value is boosted by 1.5x
    #####################
    #look up the sensitive posts of the sensitive users
    results = []
    for user in top_userlist:
        #print user
        user_detail = dict()
        user_detail['uid'] = user['uid']
        user_detail['user_sensitive'] = user['sensitive']
        user_lookup_id = user['uid']
        print user_lookup_id
        # try:
        #     #user_result=es_xnr.get(index=facebook_feedback_friends_index_name,doc_type=facebook_feedback_friends_index_type,id=user_lookup_id)['_source']
        #     user_result=es_xnr.get(index=facebook_user_index_name,doc_type=facebook_user_index_type,id=user['uid'])['_source']
        #     user_detail['user_name']=user_result['nick_name']
        # except:
        #     user_detail['user_name']=''
        user_detail['user_name'] = get_user_nickname(user['uid'])

        query_body = {
            'query': {
                'filtered': {
                    'filter': {
                        'bool': {
                            'must': [{
                                'term': {
                                    'uid': user['uid']
                                }
                            }, {
                                'range': {
                                    'sensitive': {
                                        'gte': 1
                                    }
                                }
                            }]
                        }
                    }
                }
            },
            'size': MAX_WARMING_SIZE,
            'sort': {
                'sensitive': {
                    'order': 'desc'
                }
            }
        }

        try:
            second_result = es_xnr.search(
                index=facebook_flow_text_index_name,
                doc_type=facebook_flow_text_index_type,
                body=query_body)['hits']['hits']
        except:
            second_result = []

        s_result = []
        for item in second_result:
            # look up the three engagement metrics (comment / share / favorite)
            fid_result = lookup_fid_attend_index(item['_source']['fid'],
                                                 today_datetime)
            if fid_result:
                item['_source']['comment'] = fid_result['comment']
                item['_source']['share'] = fid_result['share']
                item['_source']['favorite'] = fid_result['favorite']
            else:
                item['_source']['comment'] = 0
                item['_source']['share'] = 0
                item['_source']['favorite'] = 0
            # look up the user's nickname
            item['_source']['nick_name'] = get_user_nickname(
                item['_source']['uid'])

            s_result.append(item['_source'])

        s_result.sort(key=lambda k: (k.get('sensitive', 0)), reverse=True)
        user_detail['content'] = json.dumps(s_result)

        user_detail['xnr_user_no'] = xnr_user_no
        user_detail['validity'] = 0
        user_detail['timestamp'] = today_datetime

        # write to the database
        today_date = ts2datetime(today_datetime)
        facebook_user_warning_index_name = facebook_user_warning_index_name_pre + today_date

        task_id = xnr_user_no + '_' + user_detail['uid']
        if s_result:
            try:
                es_xnr.index(index=facebook_user_warning_index_name,
                             doc_type=facebook_user_warning_index_type,
                             body=user_detail,
                             id=task_id)
                mark = True
            except:
                mark = False
        else:
            mark = False

        results.append(mark)

    return results
Пример #14
0
def create_speech_warning(xnr_user_no, today_datetime):
    # look up the friend list
    friends_list = lookup_xnr_friends(xnr_user_no)

    query_body = {
        'query': {
            'filtered': {
                'filter': {
                    'bool': {
                        'must': {
                            'range': {
                                'sensitive': {
                                    'gte': 1
                                }
                            }
                        }
                    }
                }
            }
        },
        'size': MAX_SEARCH_SIZE,
        'sort': {
            'sensitive': {
                'order': 'desc'
            }
        }
    }
    facebook_flow_text_index_name = get_timets_set_indexset_list(
        facebook_flow_text_index_name_pre, today_datetime, today_datetime)
    #print facebook_flow_text_index_name
    results = es_xnr.search(index=facebook_flow_text_index_name,
                            doc_type=facebook_flow_text_index_type,
                            body=query_body)['hits']['hits']
    #print results
    result = []
    for item in results:
        if item['_source']['uid'] in friends_list:
            item['_source']['content_type'] = 'friends'
        else:
            item['_source']['content_type'] = 'unfriends'

        item['_source']['validity'] = 0
        item['_source']['xnr_user_no'] = xnr_user_no

        # look up the three engagement metrics (comment / share / favorite)
        fid_result = lookup_fid_attend_index(item['_source']['fid'],
                                             today_datetime)
        if fid_result:
            item['_source']['comment'] = fid_result['comment']
            item['_source']['share'] = fid_result['share']
            item['_source']['favorite'] = fid_result['favorite']
        else:
            item['_source']['comment'] = 0
            item['_source']['share'] = 0
            item['_source']['favorite'] = 0

        # look up the user's nickname
        item['_source']['nick_name'] = get_user_nickname(
            item['_source']['uid'])

        task_id = xnr_user_no + '_' + item['_source']['fid']

        # write to the database
        today_date = ts2datetime(today_datetime)
        facebook_speech_warning_index_name = facebook_speech_warning_index_name_pre + today_date
        #facebook_speech_warning_index_name=facebook_speech_warning_index_name_pre+FACEBOOK_FLOW_START_DATE
        # try:
        es_xnr.index(index=facebook_speech_warning_index_name,
                     doc_type=facebook_speech_warning_index_type,
                     body=item['_source'],
                     id=task_id)
        mark = True
        # except:
        #     mark=False

        result.append(mark)
    return result
Пример #15
0
def cron_compute_mark_qq(current_time):
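    # daily cron job: compute influence / penetration / safety marks for each QQ XNR
    # and write the result into the per-day qq_xnr_history_count index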

    current_date = ts2datetime(current_time)
    current_time_new = datetime2ts(current_date)

    xnr_results = es.search(index=qq_xnr_index_name,doc_type=qq_xnr_index_type,\
                body={'query':{'match_all':{}},'size':MAX_SEARCH_SIZE})['hits']['hits']

    if S_TYPE == 'test':
        xnr_results = [{
            '_source': {
                'xnr_user_no': 'QXNR0007',
                'qq_number': '1039598173'
            }
        }]

    mark = False
    for result in xnr_results:
        print 'result....', result
        xnr_user_no = result['_source']['xnr_user_no']
        qq_number = result['_source']['qq_number']
        #xnr_user_no = 'WXNR0004'
        influence_dict = get_influence_at_num(xnr_user_no, qq_number,
                                              current_time)
        penetration_dict = get_penetration_num(xnr_user_no, qq_number,
                                               current_time)
        safe_dict = qq_history_count(xnr_user_no, qq_number, current_time)

        #_id = xnr_user_no + '_' + current_date
        _id = xnr_user_no

        xnr_user_detail = {}
        xnr_user_detail['influence'] = influence_dict['mark']
        xnr_user_detail['penetration'] = penetration_dict['mark']
        xnr_user_detail['safe'] = safe_dict['mark']

        xnr_user_detail['daily_be_at_num'] = influence_dict['daily_be_at_num']
        xnr_user_detail['total_be_at_num'] = influence_dict['total_be_at_num']

        xnr_user_detail['daily_sensitive_num'] = penetration_dict[
            'sensitive_info']
        #xnr_user_detail['daily_sensitive_num'] = penetration_dict['daily_sensitive_num']

        xnr_user_detail['total_post_num'] = safe_dict['total_post_num']
        xnr_user_detail['daily_post_num'] = safe_dict['daily_post_num']

        xnr_user_detail['date_time'] = current_date
        xnr_user_detail['timestamp'] = current_time_new
        xnr_user_detail['xnr_user_no'] = xnr_user_no
        xnr_user_detail['qq_number'] = qq_number

        qq_xnr_history_count_index_name = qq_xnr_history_count_index_name_pre + current_date

        try:
            #print 'xnr_user_detail...',xnr_user_detail
            print 'qq_xnr_history_count_index_name...', qq_xnr_history_count_index_name
            qq_xnr_history_count_mappings(qq_xnr_history_count_index_name)
            es.index(index=qq_xnr_history_count_index_name,doc_type=qq_xnr_history_count_index_type,\
                id=_id,body=xnr_user_detail)

            mark = True

        except:
            mark = False

    return mark
Пример #16
0
def query_related_weibo(ts, origin_mid_list, time_segment):
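    # count origin / retweeted / comment weibo that reference the mids in origin_mid_list
    # within the (ts - time_segment, ts) window, aggregated by message_type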
    query_all_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [{
                            "range": {
                                "timestamp": {
                                    "gte": ts - time_segment,
                                    "lt": ts
                                }
                            }
                        }, {
                            "terms": {
                                "root_mid": origin_mid_list
                            }
                        }]
                    }
                }
            }
        },
        "aggs": {
            "all_count": {
                "terms": {
                    "field": "message_type"
                }
            }
        }
    }

    return_results = {"origin": 0, "retweeted": 0, "comment": 0}
    datetime_1 = ts2datetime(ts)
    datetime_2 = ts2datetime(ts - 24 * 3600)
    index_name_1 = flow_text_index_name_pre + datetime_1
    index_name_2 = flow_text_index_name_pre + datetime_2
    index_list = []
    exist_es_1 = es_text.indices.exists(index_name_1)
    exist_es_2 = es_text.indices.exists(index_name_2)
    if exist_es_1:
        index_list.append(index_name_1)
    if exist_es_2:
        index_list.append(index_name_2)
    if index_list:
        results = es_text.search(
            index=index_list,
            doc_type=flow_text_index_type,
            body=query_all_body)['aggregations']['all_count']['buckets']
        if results:
            for item in results:
                if int(item['key']) == 1:
                    return_results['origin'] = item['doc_count']
                elif int(item['key']) == 3:
                    return_results['retweeted'] = item['doc_count']
                elif int(item['key']) == 2:
                    return_results['comment'] = item['doc_count']
                else:
                    pass

    return_results['total_count'] = sum(return_results.values())
    return return_results
Пример #17
0
def compute_recommend_subopnion(task_detail):

    print 'starting analysis and computation......'

    task_id = task_detail['task_id'].strip('"')
    
    keywords_string = task_detail['keywords_string']

    keywords_list = keywords_string.split('&')  ## split the keyword string on '&' to get a list

    xnr_user_no = task_detail['xnr_user_no']
    mid = task_detail['mid']

    query_item = 'keywords_string'
    nest_query_list = []
    for keyword in keywords_list:
        nest_query_list.append({'wildcard':{query_item:'*'+keyword+'*'}})
    
    '''
    ## focus on the users followed by the current virtual persona
    if S_TYPE == 'test':
        # followers_list = get_result['followers_list']
        # nest_query_list.append({'terms':followers_list})
        print 'all users'
    else:
        get_result = es.get(index=fb_xnr_fans_followers_index_name,doc_type=fb_xnr_fans_followers_index_type,\
        id=xnr_user_no)['_source']
        followers_list = get_result['followers_list']
        nest_query_list.append({'terms':followers_list})
    '''

    if S_TYPE == 'test':
        create_time = datetime2ts(S_DATE_FB)
    else:
        create_time = datehour2ts(ts2datehour(time.time()-3600))
    
    #fb_get_flow_text_index_list(create_time)
    
    #index_name_list_list = fb_get_flow_text_index_list(now_timestamp)
    index_name_list = fb_get_flow_text_index_list(create_time)
    print 'index_name_list::',index_name_list
    es_results = es.search(index=index_name_list,doc_type='text',\
                    body={'query':{'bool':{'must':nest_query_list}},'size':MAX_SEARCH_SIZE})['hits']['hits']

    fb_list = []  ## input for content recommendation and sub-opinion analysis
    
    if es_results:
        for item in es_results:
            item = item['_source']
            fb = item['text']
            fb_list.append(fb)
    
    ## content recommendation

    ## get the list of recommended sentences
    #print 'fb_list::::::',fb_list
    # print 'starting content recommendation......'
    # if fb_list:
    #     content_results = summary_main(fb_list)
    # else:
    #     content_results = []

    # print 'saving content recommendation results......'

    # mark = save_content_recommendation_results(xnr_user_no,mid,task_id.encode('utf-8'),content_results)
    # print 'mark_content:::',mark
    # if mark == False:
    #     print 'error while saving content recommendation results, pushing the task back onto the queue'
    #     add_task_2_queue(keyword_task_queue_name,task_detail)
    # else:
    #     print 'content recommendation results saved......'
    
    ## sub-opinion analysis
    '''
    Input:
    fb_data: list of posts, [fb1,fb2,...]
    k_cluster: number of sub-topics (default 5)
    Output:
    opinion_name: dict of sub-topic names, {topic1:name1,topic2:name2,...}
    word_result: keyword pairs per sub-topic, {topic1:[w1,w2,...],topic2:[w1,w2,...],...}
    text_list: texts under each sub-topic, {topic1:[text1,text2,...],topic2:[text1,text2,..],..}
    '''
    
    print 'starting sub-opinion computation......'
    if fb_list:
        opinion_name,word_result,text_list = opinion_main(fb_list,k_cluster=5)
        sub_opinion_results = dict()

        for topic, text in text_list.iteritems():
            
            topic_name = opinion_name[topic]
            sub_opinion_results[topic_name] = text[:SUB_OPINION_WEIBO_LIMIT]
            
    else:
        sub_opinion_results = {}

    print 'saving sub-opinion results......'
    mark = save_subopnion_results(xnr_user_no,mid,task_id,sub_opinion_results)
    print 'mark_opinion:::',mark
    if mark == False:

        print 'error while saving sub-opinion results, pushing the task back onto the queue'

        add_task_2_queue(keyword_task_queue_name,task_detail)

    else:
        print 'sub-opinion results saved......'
Пример #18
0
def query_hot_weibo(ts, origin_mid_list, time_segment):
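    # per-mid retweet / comment counts for the origin weibo in origin_mid_list within
    # the (ts - time_segment, ts) window, using a root_mid terms aggregation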
    query_all_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [{
                            "range": {
                                "timestamp": {
                                    "gte": ts - time_segment,
                                    "lt": ts
                                }
                            }
                        }, {
                            "terms": {
                                "root_mid": origin_mid_list
                            }
                        }]
                    }
                }
            }
        },
        "aggs": {
            "all_mid": {
                "terms": {
                    "field": "root_mid",
                    "size": 400
                },
                "aggs": {
                    "message_type": {
                        "terms": {
                            "field": "message_type"
                        }
                    }
                }
            }
        }
    }

    return_results = dict()
    datetime_1 = ts2datetime(ts)
    datetime_2 = ts2datetime(ts - 24 * 3600)
    index_name_1 = flow_text_index_name_pre + datetime_1
    index_name_2 = flow_text_index_name_pre + datetime_2
    index_list = []
    exist_es_1 = es_text.indices.exists(index_name_1)
    exist_es_2 = es_text.indices.exists(index_name_2)
    if exist_es_1:
        index_list.append(index_name_1)
    if exist_es_2:
        index_list.append(index_name_2)

    index_list.append(flow_text_index_name_pre +
                      ts2datetime(ts - 2 * 24 * 3600))
    if index_list:
        results = es_text.search(
            index=index_list,
            doc_type=flow_text_index_type,
            body=query_all_body)['aggregations']['all_mid']['buckets']
        if results:
            for item in results:
                temp_dict = dict()
                temp_dict[item['key']] = item['doc_count']
                detail = item['message_type']['buckets']
                detail_dict = dict()
                for iter_item in detail:
                    detail_dict[iter_item['key']] = iter_item['doc_count']
                temp_dict['retweeted'] = detail_dict.get(3, 0)
                temp_dict['comment'] = detail_dict.get(2, 0)
                return_results[item['key']] = temp_dict
        else:
            for item in origin_mid_list:
                temp_dict = dict()
                temp_dict[item] = 0
                temp_dict['retweeted'] = 0
                temp_dict['comment'] = 0
                return_results[item] = temp_dict

    return return_results
Пример #19
0
def get_xnr_trace_community_detail(xnr_user_no, date_time):
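    # derive community-tracing thresholds for this XNR from its last 7 trace records;
    # fall back to the global default constants when no trace history exists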
    query_body = {
        'query': {
            'filtered': {
                'filter': {
                    'bool': {
                        'must': [{
                            'term': {
                                'xnr_user_no': xnr_user_no
                            }
                        }, {
                            'terms': {
                                'community_status': [0, 1, -2]
                            }
                        }, {
                            'range': {
                                'trace_time': {
                                    'lt': date_time
                                }
                            }
                        }]
                    }
                }
            }
        },
        'size': 7,
        'sort': {
            'trace_time': {
                'order': 'desc'
            }
        }
    }

    trace_index_name = weibo_trace_community_index_name_pre + xnr_user_no.lower(
    )
    trace_community_detail = dict()

    if es_xnr.indices.exists(index=trace_index_name):
        trace_result = es_xnr.search(index=trace_index_name,
                                     doc_type=weibo_trace_community_index_type,
                                     body=query_body)['hits']['hits']
        len_num = len(trace_result)
        total_num = 0
        cluster_sum = 0
        density_sum = 0
        mean_influence_sum = 0
        mean_sensitive_sum = 0
        if len_num > 0:
            for item in trace_result:
                total_num = total_num + item['_source']['num']
                cluster_sum = cluster_sum + item['_source']['cluster']
                density_sum = density_sum + item['_source']['density']
                mean_influence_sum = mean_influence_sum + item['_source'][
                    'mean_influence']
                mean_sensitive_sum = mean_sensitive_sum + item['_source'][
                    'mean_sensitive']

            trace_community_detail['min_num'] = (total_num / len_num) * 0.5
            trace_community_detail['max_num'] = (total_num / len_num) * 1.5
            trace_community_detail['cluster'] = (cluster_sum / len_num) * 0.75
            trace_community_detail['density'] = (density_sum / len_num) * 0.75
            trace_community_detail['mean_influence'] = (mean_influence_sum /
                                                        len_num) * 0.5
            trace_community_detail['mean_sensitive'] = (mean_sensitive_sum /
                                                        len_num) * 0.5
        else:
            trace_community_detail['min_num'] = MIN_COMMUNITY_NUM
            trace_community_detail['max_num'] = MAX_COMMUNITY_NUM
            trace_community_detail['cluster'] = COMMUNITY_DENSITY_CLUSTER
            trace_community_detail['density'] = COMMUNITY_DENSITY_CLUSTER
            trace_community_detail[
                'mean_influence'] = MIN_MEAN_COMMUNITY_INFLUENCE
            trace_community_detail[
                'mean_sensitive'] = MIN_MEAN_COMMUNITY_SENSITIVE
    else:
        trace_community_detail['min_num'] = MIN_COMMUNITY_NUM
        trace_community_detail['max_num'] = MAX_COMMUNITY_NUM
        trace_community_detail['cluster'] = COMMUNITY_DENSITY_CLUSTER
        trace_community_detail['density'] = COMMUNITY_DENSITY_CLUSTER
        trace_community_detail['mean_influence'] = MIN_MEAN_COMMUNITY_INFLUENCE
        trace_community_detail['mean_sensitive'] = MIN_MEAN_COMMUNITY_SENSITIVE

    return trace_community_detail
Пример #20
0
def aggregation_sentiment_related_weibo(ts,
                                        origin_mid_list,
                                        time_segment,
                                        message_type=1,
                                        uid_list=[]):
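    # aggregate sentiment-label counts for weibo related to origin_mid_list within the
    # (ts - time_segment, ts) window; when message_type != 1 the query additionally
    # filters on directed_uid against uid_list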
    if message_type == 1:
        query_all_body = {
            "query": {
                "filtered": {
                    "filter": {
                        "bool": {
                            "must": [{
                                "range": {
                                    "timestamp": {
                                        "gte": ts - time_segment,
                                        "lt": ts
                                    }
                                }
                            }, {
                                "terms": {
                                    "root_mid": origin_mid_list
                                }
                            }]
                        }
                    }
                }
            },
            "aggs": {
                "all_sentiments": {
                    "terms": {
                        "field": "sentiment"
                    }
                }
            }
        }
    else:
        query_all_body = {
            "query": {
                "filtered": {
                    "filter": {
                        "bool": {
                            "must": [{
                                "range": {
                                    "timestamp": {
                                        "gte": ts - time_segment,
                                        "lt": ts
                                    }
                                }
                            }, {
                                "terms": {
                                    "root_mid": origin_mid_list
                                }
                            }, {
                                "terms": {
                                    "directed_uid": uid_list
                                }
                            }]
                        }
                    }
                }
            },
            "aggs": {
                "all_sentiments": {
                    "terms": {
                        "field": "sentiment"
                    }
                }
            }
        }

    results = {"0": 0, "1": 0, "2": 0, "3": 0, "4": 0, "5": 0, "6": 0}
    datetime_1 = ts2datetime(ts)
    datetime_2 = ts2datetime(ts - 24 * 3600)
    index_name_1 = flow_text_index_name_pre + datetime_1
    index_name_2 = flow_text_index_name_pre + datetime_2
    index_list = []
    exist_es_1 = es_text.indices.exists(index_name_1)
    exist_es_2 = es_text.indices.exists(index_name_2)
    if exist_es_1:
        index_list.append(index_name_1)
    if exist_es_2:
        index_list.append(index_name_2)
    if index_list:
        search_results = es_text.search(
            index=index_list,
            doc_type=flow_text_index_type,
            body=query_all_body)['aggregations']['all_sentiments']['buckets']
        if search_results:
            for item in search_results:
                key = item['key']
                count = item['doc_count']
                results[key] = count
    #print "results: ", results, sum(results.values())
    return results
Пример #21
0
def newest_time_func(uid):
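    # return the newest feedback timestamps (retweet, like, at, private message,
    # received comment, made comment) for the given root uid; 0 when none is found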

    query_body = {
        'query': {
            'term': {
                'root_uid': uid
            }
        },
        'sort': {
            'timestamp': {
                'order': 'desc'
            }
        }
    }
    try:
        weibo_feedback_retweet_index_name = weibo_feedback_retweet_index_name_pre + '*'
        timestamp_retweet = es.search(index=weibo_feedback_retweet_index_name,doc_type=weibo_feedback_retweet_index_type,\
                        body=query_body)['hits']['hits'][0]['_source']['timestamp']
    except:
        timestamp_retweet = 0

    try:
        weibo_feedback_like_index_name = weibo_feedback_like_index_name_pre + '*'
        timestamp_like = es.search(index=weibo_feedback_like_index_name,doc_type=weibo_feedback_like_index_type,\
                        body=query_body)['hits']['hits'][0]['_source']['timestamp']
    except:
        timestamp_like = 0
    #timestamp_follow = es.search(index=weibo_feedback_follow_index_name,doc_type=weibo_feedback_follow_index_type,\
    # body=query_body)['hits']['hits'][0]['_source']['timestamp']
    #timestamp_fans = es.search(index=weibo_feedback_fans_index_name,doc_type=weibo_feedback_fans_index_type,\
    #body=query_body)['hits']['hits'][0]['_source']['timestamp']
    try:

        weibo_feedback_at_index_name = weibo_feedback_at_index_name_pre + '*'

        timestamp_at = es.search(index=weibo_feedback_at_index_name,doc_type=weibo_feedback_at_index_type,\
                        body=query_body)['hits']['hits'][0]['_source']['timestamp']
    except:
        timestamp_at = 0

    query_body_private = {
        'query': {
            'bool': {
                'must': [{
                    'term': {
                        'root_uid': uid
                    }
                }]
            }
        },
        'sort': {
            'timestamp': {
                'order': 'desc'
            }
        }
    }
    try:
        weibo_feedback_private_index_name = weibo_feedback_private_index_name_pre + '*'

        timestamp_private = es.search(index=weibo_feedback_private_index_name,doc_type=weibo_feedback_private_index_type,\
                        body=query_body_private)['hits']['hits'][0]['_source']['timestamp']
    except:
        timestamp_private = 0
    '''
    query_body_private_make = {
        'query':{
            'bool':{
                'must':[
                    {'term':{'root_uid':uid}},
                    {'term':{'private_type':'make'}}
                ]
            }
        },
        'sort':{'timestamp':{'order':'desc'}}
    }

    timestamp_private_make = es.search(index=weibo_feedback_private_index_name,doc_type=weibo_feedback_private_index_type,\
                        body=query_body)['hits']['hits'][0]['_source']['timestamp']
    '''

    query_body_comment_receive = {
        'query': {
            'bool': {
                'must': [{
                    'term': {
                        'root_uid': uid
                    }
                }, {
                    'term': {
                        'comment_type': 'receive'
                    }
                }]
            }
        },
        'sort': {
            'timestamp': {
                'order': 'desc'
            }
        }
    }
    try:
        weibo_feedback_comment_index_name = weibo_feedback_comment_index_name_pre + '*'

        timestamp_comment_receive = es.search(index=weibo_feedback_comment_index_name,doc_type=weibo_feedback_comment_index_type,\
                        body=query_body_comment_receive)['hits']['hits'][0]['_source']['timestamp']
    except:
        timestamp_comment_receive = 0

    query_body_comment_make = {
        'query': {
            'bool': {
                'must': [{
                    'term': {
                        'root_uid': uid
                    }
                }, {
                    'term': {
                        'comment_type': 'make'
                    }
                }]
            }
        },
        'sort': {
            'timestamp': {
                'order': 'desc'
            }
        }
    }

    try:
        timestamp_comment_make = es.search(index=weibo_feedback_comment_index_name,doc_type=weibo_feedback_comment_index_type,\
                        body=query_body_comment_make)['hits']['hits'][0]['_source']['timestamp']
    except:
        timestamp_comment_make = 0
    return timestamp_retweet, timestamp_like, timestamp_at, \
        timestamp_private, timestamp_comment_receive, timestamp_comment_make
Пример #22
0
def social_sensing(task_detail):
    '''
    with open("prediction_uid.pkl", "r") as f:
        uid_model = pickle.load(f)
    with open("prediction_weibo.pkl", "r") as f:
        weibo_model = pickle.load(f)
    '''
    # task name, sensors, end time, previous status, creator, time

    task_name = task_detail[0]
    social_sensors = task_detail[1]
    #ts = int(task_detail[2])
    ts = float(task_detail[2])

    xnr_user_no = task_detail[3]

    print ts2date(ts)
    index_list = []
    important_words = []
    datetime_1 = ts2datetime(ts)
    index_name_1 = flow_text_index_name_pre + datetime_1
    exist_es = es_text.indices.exists(index=index_name_1)
    if exist_es:
        index_list.append(index_name_1)
    datetime_2 = ts2datetime(ts - DAY)
    index_name_2 = flow_text_index_name_pre + datetime_2
    exist_es = es_text.indices.exists(index=index_name_2)
    if exist_es:
        index_list.append(index_name_2)
    if es_text.indices.exists(index=flow_text_index_name_pre +
                              ts2datetime(ts - 2 * DAY)):
        index_list.append(flow_text_index_name_pre + ts2datetime(ts - 2 * DAY))

    # PART 1

    #forward_result = get_forward_numerical_info(task_name, ts, create_by)
    # origin / retweeted weibo mid lists from the previous time range
    forward_origin_weibo_list, forward_1 = query_mid_list(
        ts - time_interval, social_sensors, forward_time_range)
    forward_retweeted_weibo_list, forward_3 = query_mid_list(
        ts - time_interval, social_sensors, forward_time_range, 3)
    # origin weibo mid list in the current interval
    current_mid_list, current_1 = query_mid_list(ts, social_sensors,
                                                 time_interval)
    current_retweeted_mid_list, current_3 = query_mid_list(
        ts, social_sensors, time_interval, 3)
    all_mid_list = []
    all_mid_list.extend(current_mid_list)
    all_mid_list.extend(current_retweeted_mid_list)
    all_mid_list.extend(forward_origin_weibo_list)
    all_mid_list.extend(forward_retweeted_weibo_list)
    all_origin_list = []
    all_origin_list.extend(current_mid_list)
    all_origin_list.extend(forward_origin_weibo_list)
    all_origin_list = list(set(all_origin_list))
    all_retweeted_list = []
    all_retweeted_list.extend(current_retweeted_mid_list)
    all_retweeted_list.extend(
        forward_retweeted_weibo_list)  #mid/root-mid of the retweeted weibo
    all_retweeted_list = list(set(all_retweeted_list))

    all_mid_list = filter_mid(all_mid_list)
    all_origin_list = filter_mid(all_origin_list)
    all_retweeted_list = filter_mid(all_retweeted_list)

    print "all mid list: ", len(all_mid_list)
    print "all_origin_list", len(all_origin_list)
    print "all_retweeted_list", len(all_retweeted_list)

    # query retweet and comment counts for these weibo in the current window, aggregated by message_type
    #statistics_count = query_related_weibo(ts, all_mid_list, time_interval)
    if all_origin_list:
        #origin_weibo_detail = query_hot_weibo(ts, all_origin_list, time_interval) # origin weibo detail
        origin_weibo_detail = dict()
        for mid in all_origin_list:
            retweet_count = es_text.count(
                index=index_list,
                doc_type="text",
                body={"query": {
                    "bool": {
                        "must": [{
                            "term": {
                                "fid": mid
                            }
                        }]
                    }
                }})["count"]
            comment_count = es_text.count(
                index=index_list,
                doc_type="text",
                body={"query": {
                    "bool": {
                        "must": [{
                            "term": {
                                "fid": mid
                            }
                        }]
                    }
                }})["count"]
            tmp = dict()
            tmp["retweeted"] = retweet_count
            tmp["comment"] = comment_count
            origin_weibo_detail[mid] = tmp
    else:
        origin_weibo_detail = {}
    print "len(origin_weibo_detail): ", len(origin_weibo_detail)
    if all_retweeted_list:
        retweeted_weibo_detail = dict()
        for mid in all_retweeted_list:
            retweet_count = es_text.count(index=index_list,
                                          doc_type="text",
                                          body={
                                              "query": {
                                                  "bool": {
                                                      "must": [{
                                                          "term": {
                                                              "root_mid": mid
                                                          }
                                                      }, {
                                                          "term": {
                                                              "message_type": 3
                                                          }
                                                      }]
                                                  }
                                              }
                                          })["count"]
            comment_count = es_text.count(index=index_list,
                                          doc_type="text",
                                          body={
                                              "query": {
                                                  "bool": {
                                                      "must": [{
                                                          "term": {
                                                              "root_mid": mid
                                                          }
                                                      }, {
                                                          "term": {
                                                              "message_type": 2
                                                          }
                                                      }]
                                                  }
                                              }
                                          })["count"]
            tmp = dict()
            tmp["retweeted"] = retweet_count
            tmp["comment"] = comment_count
            retweeted_weibo_detail[mid] = tmp
        #retweeted_weibo_detail = query_hot_weibo(ts, all_retweeted_list, time_interval) # retweeted weibo detail
    else:
        retweeted_weibo_detail = {}
    print "len(retweeted_weibo_detail): ", len(retweeted_weibo_detail)
    #current_total_count = statistics_count['total_count']

    # total number of weibo in the current interval
    #current_retweeted_count = statistics_count['retweeted']
    #current_comment_count = statistics_count['comment']

    #all_mid_list = list(set(all_origin_list[:100]) | set(all_retweeted_list[:100]))

    # sensed events, all_mid_list
    sensitive_text_list = []
    tmp_sensitive_warning = ""
    text_dict = dict()  # text info
    mid_value = dict()  # value assigned to each text
    duplicate_dict = dict()  # duplicate mapping
    portrait_dict = dict()  # user background (portrait) info
    classify_text_dict = dict()  # texts for classification
    classify_uid_list = []
    duplicate_text_list = []
    sensitive_words_dict = dict()
    sensitive_weibo_detail = {}
    trendline_dict = dict()
    all_text_dict = dict()

    # start when an event occurs
    if 1:
        print "index_list:", index_list

        if index_list and all_mid_list:
            query_body = {
                "query": {
                    "filtered": {
                        "filter": {
                            "terms": {
                                "mid": all_mid_list
                            }
                        }
                    }
                },
                "size": 5000
            }
            search_results = es_text.search(index=index_list,
                                            doc_type="text",
                                            body=query_body)['hits']['hits']
            print "search mid len: ", len(search_results)
            tmp_sensitive_warning = ""
            text_dict = dict()  # text info
            mid_value = dict()  # value assigned to each text
            duplicate_dict = dict()  # duplicate mapping
            portrait_dict = dict()  # user background (portrait) info
            classify_text_dict = dict()  # texts for classification
            #classify_uid_list = []
            classify_mid_list = []
            duplicate_text_list = []
            sensitive_words_dict = dict()
            mid_ts_dict = dict()  # post timestamp of each text
            uid_prediction_dict = dict()
            weibo_prediction_dict = dict()
            trendline_dict = dict()
            feature_prediction_list = []  # feature
            mid_prediction_list = []  # corresponding mids
            if search_results:
                for item in search_results:
                    iter_uid = item['_source']['uid']
                    iter_mid = item['_source']['mid']
                    mid_ts_dict[iter_mid] = item["_source"]["timestamp"]
                    iter_text = item['_source']['text'].encode(
                        'utf-8', 'ignore')
                    iter_sensitive = item['_source'].get('sensitive', 0)
                    tmp_text = get_weibo(item['_source'])
                    all_text_dict[iter_mid] = tmp_text

                    duplicate_text_list.append({
                        "_id":
                        iter_mid,
                        "title":
                        "",
                        "content":
                        iter_text.decode("utf-8", 'ignore')
                    })

                    if iter_sensitive:
                        tmp_sensitive_warning = signal_sensitive_variation  #weibo that contain sensitive words
                        sensitive_words_dict[iter_mid] = iter_sensitive

                    keywords_dict = json.loads(
                        item['_source']['keywords_dict'])
                    personal_keywords_dict = dict()
                    for k, v in keywords_dict.iteritems():
                        k = k.encode('utf-8', 'ignore')
                        personal_keywords_dict[k] = v
                    classify_text_dict[iter_mid] = personal_keywords_dict
                    #classify_uid_list.append(iter_uid)
                    classify_mid_list.append(iter_mid)

                # deduplication
                print "start duplicate"
                if duplicate_text_list:
                    dup_results = duplicate(duplicate_text_list)
                    for item in dup_results:
                        if item['duplicate']:
                            duplicate_dict[item['_id']] = item['same_from']

                # classification
                print "start classify"
                mid_value = dict()
                if classify_text_dict:
                    #classify_results = topic_classfiy(classify_uid_list, classify_text_dict)
                    classify_results = topic_classfiy(classify_mid_list,
                                                      classify_text_dict)

                    #print "classify_results: ", classify_results

                    for k, v in classify_results.iteritems():  # mid:value
                        #mid_value[k] = topic_value_dict[v[0]]
                        mid_value[k] = v[0]
                        #feature_list = organize_feature(k, mid_ts_dict[k])
                        #feature_prediction_list.append(feature_list) # feature list
                        #mid_prediction_list.append(k) # corresponding

                # prediction
                """
                print "start prediction"
                weibo_prediction_result = weibo_model.predict(feature_prediction_list)
                uid_prediction_result = uid_model.predict(feature_prediction_list)
                for i in range(len(mid_prediction_list)):
                    if  i % 100 == 0:
                        print i
                    uid_prediction_dict[mid_prediction_list[i]] = uid_prediction_result[i]
                    weibo_prediction_dict[mid_prediction_list[i]] = weibo_prediction_result[i]
                    tmp_trendline = trendline_list(mid_prediction_list[i], weibo_prediction_result[i], mid_ts_dict[mid_prediction_list[i]])
                    trendline_dict[mid_prediction_list[i]] = tmp_trendline
                """
    # organize data

    mid_list = all_text_dict.keys()
    print "final mid:", len(mid_list)
    print "intersection: ", len(set(mid_list) & set(all_mid_list))
    bulk_action = []
    count = 0
    for mid in mid_list:
        iter_dict = dict()
        if origin_weibo_detail.has_key(mid):
            iter_dict.update(origin_weibo_detail[mid])
            iter_dict["type"] = 1
        elif retweeted_weibo_detail.has_key(mid):
            iter_dict.update(retweeted_weibo_detail[mid])
            iter_dict["type"] = 3
        else:
            iter_dict["retweeted"] = 0
            iter_dict["comment"] = 0
            print "mid in all_mid_list: ", mid in set(all_mid_list)

        #iter_dict["trendline"] = json.dumps(trendline_dict[mid])
        if duplicate_dict.has_key(mid):
            iter_dict["duplicate"] = duplicate_dict[mid]
        else:
            iter_dict["duplicate"] = ""

        #iter_dict["uid_prediction"] = uid_prediction_dict[mid]
        #iter_dict["weibo_prediction"] = weibo_prediction_dict[mid]
        iter_dict["compute_status"] = 0  # 尚未计算
        iter_dict["topic_field"] = mid_value[mid]
        iter_dict["detect_ts"] = ts
        iter_dict["xnr_user_no"] = xnr_user_no

        iter_dict.update(all_text_dict[mid])
        count += 1
        print 'iter_dict:::', iter_dict
        _id = xnr_user_no + '_' + mid
        bulk_action.extend([{"index": {"_id": _id}}, iter_dict])
        if count % 500 == 0:
            es_xnr.bulk(bulk_action,
                        index="social_sensing_text",
                        doc_type="text",
                        timeout=600)
            bulk_action = []

    if bulk_action:
        es_xnr.bulk(bulk_action,
                    index="social_sensing_text",
                    doc_type="text",
                    timeout=600)

    return "1"
Пример #23
0
def get_un_trace_follow_operate(xnr_user_no, uid_string, nick_name_string):
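    # remove the given uids (from uid_string, or resolved from nick_name_string) from
    # the XNR's trace_follow_list; returns [mark, fail_uids, fail_nick_name_list]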

    mark = False
    fail_nick_name_list = []
    fail_uids = []

    if uid_string:
        uid_list = uid_string.encode('utf-8').split(',')

    elif nick_name_string:
        nick_name_list = nick_name_string.encode('utf-8').split(',')
        uid_list = []

        for nick_name in nick_name_list:
            query_body = {
                'query': {
                    'filtered': {
                        'filter': {
                            'term': {
                                'nick_name': nick_name
                            }
                        }
                    }
                },
                '_source': ['uid']
            }
            try:
                uid_results = es.search(index=facebook_user_index_name,doc_type=facebook_user_index_type,\
                            body=query_body)['hits']['hits']

                uid_result = uid_results[0]['_source']
                uid = uid_result['uid']
                uid_list.append(uid)

            except:
                fail_nick_name_list.append(nick_name)

    try:
        result = es.get(index=fb_xnr_fans_followers_index_name,doc_type=fb_xnr_fans_followers_index_type,\
                            id=xnr_user_no)['_source']

        trace_follow_list = result['trace_follow_list']

        # uids in common
        comment_uids = list(set(trace_follow_list).intersection(set(uid_list)))

        # uids for which cancellation failed
        fail_uids = list(set(comment_uids).difference(set(uid_list)))

        # set difference
        trace_follow_list = list(
            set(trace_follow_list).difference(set(uid_list)))


        es.update(index=fb_xnr_fans_followers_index_name,doc_type=fb_xnr_fans_followers_index_type,\
                            id=xnr_user_no,body={'doc':{'trace_follow_list':trace_follow_list}})

        mark = True
    except:
        mark = False

    return [mark, fail_uids, fail_nick_name_list]
Пример #24
0
def getgroup_v2(qq_xnr):
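    # ask the local qqbot process for the contact list of this QQ XNR, refresh the
    # stored group_info in ES, and return a {uin: group_name} dict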
    group_dict = {}
    #step0: get qqbot_port
    if qq_xnr[:4] != 'QXNR':

        search_result = es.search(index=qq_xnr_index_name,doc_type=qq_xnr_index_type,\
            body={'query':{'term':{'qq_number':qq_xnr}}})['hits']['hits']

        qq_xnr = search_result[0]['_id']

    #try:
    qq_xnr_es_result = es.get(index=qq_xnr_index_name,\
            doc_type=qq_xnr_index_type, id=qq_xnr, _source=True)['_source']

    group_info = json.loads(qq_xnr_es_result['group_info'])

    qqbot_port = qq_xnr_es_result['qqbot_port']
    print 'qqbot_port..', qqbot_port
    # p_str = 'qq '+str(qqbot_port) + ' list buddy'
    # p_str = "qq "+str(qqbot_port) + " .List('buddy')"
    p_str = "qq " + str(qqbot_port) + " list buddy"
    print("111111111111111111111", p_str)
    p = subprocess.Popen(p_str, shell=True, \
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    line_count = 0
    print 'print p ======================='
    print p
    for line in p.stdout.readlines():
        line_count += 1
        print 'line.==========', line
        if line_count >= 5 and line_count % 2 == 1:
            item_line_list = line.split('|')

            try:
                #qq_group_number = str(int(item_line_list[2]))
                qq_uin_number = str(int(item_line_list[7]))
                #print 'qq_uin_number..',qq_uin_number
                qq_group_name = item_line_list[4]
                qq_mark_name = item_line_list[5]
                # group_dict[qq_group_number] = qq_group_name
                group_dict[qq_uin_number] = qq_group_name

                # if the uin entry is empty, add the uin; otherwise update the group name (it may have been renamed)
                for key, value_dict in group_info.iteritems():

                    mark_name = value_dict['mark_name']

                    if not qq_mark_name:
                        if qq_mark_name == mark_name:
                            if not qq_group_name in value_dict['group_name']:
                                group_info[key]['group_name'].append(
                                    qq_group_name)

            except:
                continue

    group_info = json.dumps(group_info)
    es.update(index=qq_xnr_index_name,
              doc_type=qq_xnr_index_type,
              id=qq_xnr,
              body={'doc': {
                  'group_info': group_info
              }})

    print 'group_dict::len..', len(group_dict)

    return group_dict
Пример #25
0
def match_flow_text():
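    # for each weibo XNR whose create_status is 2, copy today's flow_text posts published
    # by its uid into the per-day new_xnr_flow_text index, adding topic-classification fields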

    current_time = int(time.time())
    current_date = ts2datetime(current_time)

    new_xnr_flow_text_index_name = new_xnr_flow_text_index_name_pre + current_date

    new_weibo_xnr_flow_text_mappings(new_xnr_flow_text_index_name)

    #xnr_flow_text_index_name = xnr_flow_text_index_name_pre + current_date
    flow_text_index_name = flow_text_index_name_pre + current_date

    query_body = {'query': {'term': {'create_status': 2}}, 'size': MAX_VALUE}

    try:
        search_results = es_xnr.search(index=weibo_xnr_index_name,doc_type=weibo_xnr_index_type,\
              body=query_body)['hits']['hits']
        #print 'search_results...',search_results
        bulk_action = []
        count = 0

        for result in search_results:
            result = result['_source']
            uid = result['uid']
            xnr_user_no = result['xnr_user_no']

            match_query_body = {
                'query': {
                    'bool': {
                        'must': [{
                            'term': {
                                'uid': uid
                            }
                        }]
                    }
                },
                'size': MAX_VALUE
            }

            match_results = es_flow_text.search(index=flow_text_index_name,doc_type=flow_text_index_type,\
               body=match_query_body)['hits']['hits']
            print 'match_results..', match_results
            for match_item in match_results:

                match_item = match_item['_source']

                keyword_dict = match_item['keywords_dict']
                mid = match_item['mid']

                keywords_dict = json.loads(keyword_dict)
                personal_keywords_dict = dict()
                classify_text_dict = dict()  # texts for classification
                mid_value = dict()
                for k, v in keywords_dict.iteritems():
                    k = k.encode('utf-8', 'ignore')
                    personal_keywords_dict[k] = v
                classify_text_dict[mid] = personal_keywords_dict

                if classify_text_dict:
                    classify_results = topic_classfiy([mid],
                                                      classify_text_dict)

                for k, v in classify_results.iteritems():  # mid:value

                    mid_value[k] = v

                match_item["topic_field_first"] = topic_en2ch_dict[
                    mid_value[mid][0]]
                match_item["topic_field"] = '&'.join(mid_value[mid])
                match_item['xnr_user_no'] = xnr_user_no

                action = {'index': {'_id': mid}}
                source = match_item
                bulk_action.extend([action, source])

                count += 1
                if count % 1000 == 0:
                    #print 'bulk_action..',bulk_action
                    es_xnr.bulk(bulk_action,
                                index=new_xnr_flow_text_index_name,
                                doc_type=new_xnr_flow_text_index_type,
                                timeout=600)
                    bulk_action = []

            if bulk_action:
                #print 'bulk_action..',bulk_action
                es_xnr.bulk(bulk_action,
                            index=new_xnr_flow_text_index_name,
                            doc_type=new_xnr_flow_text_index_type,
                            timeout=600)
                bulk_action = []

    except:
        return 'no tweets to update today'
Пример #26
0
def qq_history_count(xnr_user_no, qq_number, current_time):

    if S_TYPE == 'test':
        current_time = datetime2ts(QQ_S_DATE_ASSESSMENT)

    current_date = ts2datetime(current_time)
    last_date = ts2datetime(current_time - DAY)

    group_message_index_name = group_message_index_name_pre + current_date
    qq_xnr_history_count_index_name = qq_xnr_history_count_index_name_pre + last_date

    # get today's post count
    query_body = {
        'query': {
            'bool': {
                'must': [{
                    'term': {
                        'speaker_qq_number': qq_number
                    }
                }, {
                    'term': {
                        'xnr_qq_number': qq_number
                    }
                }]
            }
        }
    }

    count_result = es.count(index=group_message_index_name,
                            doc_type=group_message_index_type,
                            body=query_body)

    if count_result['_shards']['successful'] != 0:
        today_count = count_result['count']
    else:
        print 'es index rank error'
        today_count = 0

    # get the historical total post count
    try:
        get_result = es.get(index=qq_xnr_history_count_index_name,doc_type=qq_xnr_history_count_index_type,\
                            id=xnr_user_no)['_source']

        total_count_history = get_result['total_post_num']

    except:
        total_count_history = 0

    total_count_today = total_count_history + today_count

    item_dict = dict()
    item_dict['total_post_num'] = total_count_today
    item_dict['daily_post_num'] = today_count

    # the most active speaker today in the groups this XNR belongs to
    query_body_total_day = {
        'query': {
            'filtered': {
                'filter': {
                    'term': {
                        'xnr_qq_number': qq_number
                    }
                }
            }
        },
        'aggs': {
            'all_speakers': {
                'terms': {
                    'field': 'speaker_qq_number',
                    "order": {
                        "_count": "desc"
                    }
                }
            }
        }
    }

    try:

        results_total_day = es_xnr.search(index=group_message_index_name,doc_type=group_message_index_type,\
                    body=query_body_total_day)['aggregations']['all_speakers']['buckets']

        speaker_max = results_total_day[0]['doc_count']
    except:
        speaker_max = today_count

    safe = (float(math.log(today_count + 1)) /
            (math.log(speaker_max + 1) + 1)) * 100

    safe = round(safe, 2)  # keep two decimal places

    item_dict['mark'] = safe

    return item_dict
Пример #27
0
def publish_operate_timing():
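    # scan scheduled posts whose task_status is 0 and publish those whose post_time
    # has passed, then mark the task as done (task_status = 1)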

    query_body = {
        'query': {
            'filtered': {
                'filter': {
                    'term': {
                        'task_status': 0
                    }
                }
            }
        },
        'size': MAX_SEARCH_SIZE
    }

    results = es_xnr.search(index=weibo_xnr_timing_list_index_name,doc_type=\
                    weibo_xnr_timing_list_index_type,body=query_body)['hits']['hits']
    print 'results::', results
    if results:
        for result in results:
            _id = result['_id']
            result = result['_source']
            timestamp_set = result['post_time']
            print timestamp_set
            if timestamp_set <= int(time.time()):
                print '!!'
                text = result['text'].encode('utf-8')
                tweet_type = task_source_ch2en[result['task_source']]
                xnr_user_no = result['xnr_user_no']

                try:
                    p_url = result['p_url']
                except:
                    p_url = ''
                try:
                    rank = result['rank']
                except:
                    rank = u'0'
                try:
                    rankid = result['rankid']
                except:
                    rankid = ''
                #r_mid = result['mid']

                es_get_result = es_xnr.get(index=weibo_xnr_index_name,
                                           doc_type=weibo_xnr_index_type,
                                           id=xnr_user_no)['_source']

                weibo_mail_account = es_get_result['weibo_mail_account']
                weibo_phone_account = es_get_result['weibo_phone_account']
                password = es_get_result['password']

                if weibo_mail_account:
                    account_name = weibo_mail_account
                elif weibo_phone_account:
                    account_name = weibo_phone_account
                else:
                    return False

                mark = publish_tweet_func(account_name, password, text, p_url,
                                          rank, rankid, tweet_type,
                                          xnr_user_no)

                if mark[0]:
                    #task_id = xnr_user_no + '_' + r_mid
                    task_id = _id
                    # item_exist = es_xnr.get(index=weibo_xnr_retweet_timing_list_index_name,doc_type=\
                    #        weibo_xnr_retweet_timing_list_index_type,id=task_id)['_source']
                    item_exist = {}
                    item_exist['task_status'] = 1
                    #item_exist['timstamp_post'] = int(time.time())

                    es_xnr.update(index=weibo_xnr_timing_list_index_name,doc_type=\
                        weibo_xnr_timing_list_index_type,id=task_id,body={'doc':item_exist})

                    # # save the weibo
                    # try:
                    #     save_mark = save_to_xnr_flow_text(tweet_type,xnr_user_no,text)
                    # except:
                    #     print 'error while saving the weibo!'
                    #     save_mark = False
            else:
                continue
Пример #28
0
def get_penetration_num(xnr_user_no, qq_number, current_time):
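    # penetration score: average sensitivity of today's messages in the XNR's QQ groups,
    # normalised against the day's maximum sensitive_value on a log scale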

    follow_group_sensitive = {}
    follow_group_sensitive['sensitive_info'] = {}

    get_result = es_xnr.get(index=qq_xnr_index_name,
                            doc_type=qq_xnr_index_type,
                            id=xnr_user_no)['_source']

    #group_list = get_result['qq_groups']
    group_list = []
    group_info = json.loads(get_result['group_info'])

    for key, value_dict in group_info.iteritems():
        group_name = value_dict['group_name']
        group_list.extend(group_name)

    if S_TYPE == 'test':
        current_time = datetime2ts(QQ_S_DATE_ASSESSMENT)

    current_date = ts2datetime(current_time)

    group_message_index_name = group_message_index_name_pre + current_date

    query_body_info = {
        'query': {
            'filtered': {
                'filter': {
                    'terms': {
                        'qq_group_nickname': group_list
                    }
                }
            }
        },
        'aggs': {
            'avg_sensitive': {
                'avg': {
                    'field': 'sensitive_value'
                }
            }
        }
    }
    try:
        es_sensitive_result = es_xnr.search(index=group_message_index_name,doc_type=group_message_index_type,\
            body=query_body_info)['aggregations']
        sensitive_value = es_sensitive_result['avg_sensitive']['value']

        if sensitive_value == None:
            sensitive_value = 0.0
        follow_group_sensitive['sensitive_info'] = round(sensitive_value, 2)
    except:
        sensitive_value = 0.0
        follow_group_sensitive['sensitive_info'] = 0

    #if i == (WEEK-1):
    query_body_max = {
        'query': {
            'filtered': {
                'filter': {
                    'terms': {
                        'qq_group_nickname': group_list
                    }
                }
            }
        },
        'sort': {
            'sensitive_value': {
                'order': 'desc'
            }
        }
    }
    try:
        max_results = es_xnr.search(index=group_message_index_name,doc_type=group_message_index_type,\
                        body=query_body_max)['hits']['hits']

        max_sensitive = max_results[0]['_source']['sensitive_value']
    except:
        max_sensitive = 0

    penetration = (math.log(sensitive_value + 1) /
                   (math.log(max_sensitive + 1) + 1)) * 100
    penetration = round(penetration, 2)

    follow_group_sensitive['mark'] = penetration

    return follow_group_sensitive
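The penetration score computed above is a log-scaled ratio: the group's average sensitivity is compared against the single most sensitive message, with log(x + 1) damping both values and the extra +1 in the denominator keeping it non-zero even when max_sensitive is 0. A small worked example of the same formula, using made-up numbers purely for illustration:

import math

def penetration_score(avg_sensitive, max_sensitive):
    # Same expression as in get_penetration_num: log-scale both values,
    # take the ratio and express it as a percentage.
    score = (math.log(avg_sensitive + 1) /
             (math.log(max_sensitive + 1) + 1)) * 100
    return round(score, 2)

print(penetration_score(2.5, 40.0))  # -> 26.58
print(penetration_score(0.0, 0.0))   # -> 0.0 (the denominator never reaches zero)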
Example #29
0
def retweet_operate_timing():

    query_body = {
        'query': {
            'filtered': {
                'filter': {
                    'term': {
                        'compute_status': 0
                    }
                }
            }
        }
    }

    results = es_xnr.search(index=weibo_xnr_retweet_timing_list_index_name,doc_type=\
                    weibo_xnr_retweet_timing_list_index_type,body=query_body)['hits']['hits']
    if results:
        for result in results:
            result = result['_source']
            timestamp_set = result['timestamp_set']

            if timestamp_set <= int(time.time()):

                text = result['text'].encode('utf-8')
                tweet_type = 'trace_follow_tweet'
                xnr_user_no = result['xnr_user_no']
                r_mid = result['mid']

                es_get_result = es_xnr.get(index=weibo_xnr_index_name,
                                           doc_type=weibo_xnr_index_type,
                                           id=xnr_user_no)['_source']

                weibo_mail_account = es_get_result['weibo_mail_account']
                weibo_phone_account = es_get_result['weibo_phone_account']
                password = es_get_result['password']

                if weibo_mail_account:
                    account_name = weibo_mail_account
                elif weibo_phone_account:
                    account_name = weibo_phone_account
                else:
                    return False
                print 'text::', text
                print 'r_mid:::', r_mid
                text = ''  # empty retweet: repost with no added text
                mark = retweet_tweet_func(account_name, password, text, r_mid,
                                          tweet_type, xnr_user_no)
                print 'mark::', mark[0]
                if mark[0]:
                    task_id = xnr_user_no + '_' + r_mid
                    # item_exist = es_xnr.get(index=weibo_xnr_retweet_timing_list_index_name,doc_type=\
                    #        weibo_xnr_retweet_timing_list_index_type,id=task_id)['_source']
                    item_exist = {}
                    item_exist['compute_status'] = 1
                    #item_exist['timstamp_post'] = int(time.time())

                    es_xnr.update(index=weibo_xnr_retweet_timing_list_index_name,doc_type=\
                        weibo_xnr_retweet_timing_list_index_type,id=task_id,body={'doc':item_exist})

                    # # Save the weibo post
                    # try:
                    #     save_mark = save_to_xnr_flow_text(tweet_type,xnr_user_no,text)
                    # except:
                    #     print 'Error while saving the weibo post!'
                    #     save_mark = False
            else:
                continue
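retweet_operate_timing only fires tasks whose timestamp_set has already passed and flips compute_status to 1 so they are skipped on the next pass, which means it is meant to be polled repeatedly. A minimal driver loop for that, assuming nothing beyond the function itself and the standard library (the interval is an arbitrary choice, not a project setting):

import time

POLL_INTERVAL = 60  # seconds between scans; illustrative value

def run_retweet_timing_worker():
    # Scan the pending list forever; finished tasks are marked in
    # Elasticsearch by retweet_operate_timing itself, so each pass only
    # posts the retweets that are newly due.
    while True:
        retweet_operate_timing()
        time.sleep(POLL_INTERVAL)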
Example #30
0
def detect_by_keywords(keywords, datetime_list):
    keywords_list = []
    model = gensim.models.KeyedVectors.load_word2vec_format(WORD2VEC_PATH,
                                                            binary=True)
    for word in keywords:
        simi_list = model.most_similar(word, topn=20)
        for simi_word in simi_list:
            keywords_list.append(simi_word[0])

    group_uid_list = set()
    if datetime_list == []:
        return []

    query_item = 'text'
    flow_text_index_name_list = []
    for datetime_str in datetime_list:
        flow_text_index_name = facebook_flow_text_index_name_pre + datetime_str
        flow_text_index_name_list.append(flow_text_index_name)

    nest_query_list = []
    # the text may contain English or traditional Chinese characters, so match all variants
    en_keywords_list = trans(keywords_list, target_language='en')
    for i in range(len(keywords_list)):
        keyword = keywords_list[i]
        traditional_keyword = simplified2traditional(keyword)

        if len(en_keywords_list) == len(keywords_list):  # make sure the translation did not fail
            en_keyword = en_keywords_list[i]
            nest_query_list.append(
                {'wildcard': {
                    query_item: '*' + en_keyword + '*'
                }})

        nest_query_list.append({'wildcard': {query_item: '*' + keyword + '*'}})
        nest_query_list.append(
            {'wildcard': {
                query_item: '*' + traditional_keyword + '*'
            }})

    count = MAX_DETECT_COUNT
    if len(nest_query_list) == 1:
        SHOULD_PERCENT = 1  # absolute count: require at least one keyword to match
    else:
        SHOULD_PERCENT = '3'  # relative count: with 2 keywords require 2 matches, with 3 keywords require 2 matches

    query_body = {
        'query': {
            'bool': {
                'should': nest_query_list,
                'minimum_should_match': SHOULD_PERCENT,
                # 'must_not':{'terms':{'uid':white_uid_list}}
            }
        },
        'aggs': {
            'all_uids': {
                'terms': {
                    'field': 'uid',
                    'order': {
                        '_count': 'desc'
                    },
                    'size': count
                }
            }
        }
    }
    es_results = es_xnr.search(index=flow_text_index_name_list,doc_type=facebook_flow_text_index_type,\
                body=query_body,request_timeout=999999)['aggregations']['all_uids']['buckets']

    for i in range(len(es_results)):
        uid = es_results[i]['key']
        group_uid_list.add(uid)
    group_uid_list = list(group_uid_list)
    return group_uid_list
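detect_by_keywords expands each seed keyword to its 20 nearest word2vec neighbours, queries the Facebook flow-text indexes with wildcard clauses for the simplified, traditional and English variants, and returns the uids that match most often. A hypothetical call is sketched below; the seed words and the date format are assumptions for illustration (the index suffix is taken to be a 'YYYY-MM-DD' string appended to facebook_flow_text_index_name_pre, as above).

from datetime import datetime, timedelta

# Build index-date strings for the last 7 days.
end_day = datetime(2017, 9, 1)  # illustrative end date
datetime_list = [(end_day - timedelta(days=i)).strftime('%Y-%m-%d')
                 for i in range(7)]

seed_keywords = [u'keyword_a', u'keyword_b']  # placeholder seed words
suspect_uids = detect_by_keywords(seed_keywords, datetime_list)
print(len(suspect_uids))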