Exemplo n.º 1
0
def update_keywords(uid_list=None):
    """Gather per-user keyword dictionaries from the Facebook flow-text indices.

    Args:
        uid_list: uids to restrict the search to. When empty or omitted, the
            result of load_uid_list() is used instead.

    For every index name produced by
    get_facebook_flow_text_index_list(load_timestamp()), documents whose
    "uid" is in uid_list are fetched (at most MAX_SEARCH_SIZE per index,
    sorted by timestamp descending). Each hit's JSON-encoded "keywords_dict"
    field is combined with that user's running dictionary through
    merge_dict().

    An error raised while handling one index is printed and the loop moves
    on to the next index.

    NOTE(review): user_keywords is built but neither returned nor stored by
    the code visible here; confirm what callers expect.
    """
    # The default is None rather than a shared mutable list; an empty value
    # still falls back to every known uid, exactly as before.
    if not uid_list:
        uid_list = load_uid_list()
    fb_flow_text_index_list = get_facebook_flow_text_index_list(load_timestamp())
    keywords_query_body = {
        'query': {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [
                            {"terms": {"uid": uid_list}},
                        ]
                    }
                }
            }
        },
        'size': MAX_SEARCH_SIZE,
        "sort": {"timestamp": {"order": "desc"}},
        "fields": ["keywords_dict", "uid"]
    }
    user_keywords = {}
    for index_name in fb_flow_text_index_list:
        try:
            search_results = es.search(
                index=index_name,
                doc_type=facebook_flow_text_index_type,
                body=keywords_query_body)['hits']['hits']
            for item in search_results:
                # With "fields" in the query every value arrives as a list.
                content = item['fields']
                uid = content['uid'][0]
                user_entry = user_keywords.setdefault(uid, {'keywords': {}})
                if 'keywords_dict' in content:
                    user_entry['keywords'] = merge_dict(
                        user_entry['keywords'],
                        json.loads(content['keywords_dict'][0]))
        except Exception as e:
            # Best effort: report the failure and continue with the next index.
            print(e)
Exemplo n.º 2
0
def get_tweets_from_user_portrait(monitor_keywords_list, sort_item_new):
    """Look up flow-text posts for the top-ranked portrait users.

    The portrait index is searched with a match_all query ordered by
    sort_item_new (descending) and limited to USER_POETRAIT_NUMBER hits.
    The distinct document ids of those hits are used as the uid list that is
    handed, together with monitor_keywords_list, to
    uid_lists2fb_from_flow_text(); its result is returned unchanged.
    """
    portrait_query = {
        'query': {'match_all': {}},
        'sort': {sort_item_new: {'order': 'desc'}},
        'size': USER_POETRAIT_NUMBER,
    }
    response = es_fb_user_portrait.search(
        index=fb_portrait_index_name,
        doc_type=fb_portrait_index_type,
        body=portrait_query)
    portrait_hits = response['hits']['hits']

    # The document id of a portrait hit is used as the uid; a set removes
    # duplicates before the list is passed on.
    if portrait_hits:
        distinct_uids = set(hit['_id'] for hit in portrait_hits)
    else:
        distinct_uids = set()

    return uid_lists2fb_from_flow_text(monitor_keywords_list,
                                       list(distinct_uids))
Exemplo n.º 3
0
def update_influence(uid_list=None):
    """Gather per-user influence values from the Facebook BCI indices.

    Args:
        uid_list: uids to restrict the search to. When empty or omitted, the
            result of load_uid_list() is used instead.

    For every index name produced by get_fb_bci_index_list(load_timestamp()),
    documents whose "uid" is in uid_list are fetched (at most MAX_SEARCH_SIZE
    per index, sorted by timestamp descending). Each hit's "influence" value
    is converted to float and appended to that user's "influence_list".

    An error raised while handling one index is printed and the loop moves
    on to the next index.

    NOTE(review): user_influence is built but neither returned nor stored by
    the code visible here; confirm what callers expect.
    """
    # The default is None rather than a shared mutable list; an empty value
    # still falls back to every known uid, exactly as before.
    if not uid_list:
        uid_list = load_uid_list()
    fb_bci_index_list = get_fb_bci_index_list(load_timestamp())
    fb_influence_query_body = {
        'query': {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [
                            {"terms": {"uid": uid_list}},
                        ]
                    }
                }
            }
        },
        'size': MAX_SEARCH_SIZE,
        "sort": {"timestamp": {"order": "desc"}},
        "fields": ["influence", "uid"]
    }
    user_influence = {}
    for index_name in fb_bci_index_list:
        try:
            search_results = es.search(
                index=index_name,
                doc_type=fb_bci_index_type,
                body=fb_influence_query_body)['hits']['hits']
            for item in search_results:
                # With "fields" in the query every value arrives as a list.
                content = item['fields']
                uid = content['uid'][0]
                user_entry = user_influence.setdefault(
                    uid, {'influence_list': []})
                if 'influence' in content:
                    user_entry['influence_list'].append(
                        float(content['influence'][0]))
        except Exception as e:
            # Best effort: report the failure and continue with the next index.
            print(e)
Exemplo n.º 4
0
def _default_baseinfo(uid):
    """Return the placeholder base-info record used when no data is known."""
    return {
        'uid': str(uid),
        'uname': '',
        'gender': 0,
        'location': '',
    }


def update_baseinfo(uid_list=None):
    """Build base-info records for the given uids and pass them to save_data2es.

    Args:
        uid_list: uids to look up in the Facebook user index. Omitting it is
            equivalent to passing an empty list.

    Returns:
        Whatever save_data2es() returns for the uid -> record mapping.

    Each record has the keys "uid", "uname", "gender" and "location".
    "gender" is 1 for 'male', 2 for 'female' and 0 otherwise. "location" is
    the result of get_user_location() applied to the JSON-decoded "location"
    field, or '' when the field is absent. Requested uids with no hit in the
    index receive a placeholder record.
    """
    # None replaces the former mutable [] default; behaviour is the same.
    if uid_list is None:
        uid_list = []
    user_baseinfo = {}
    fb_user_query_body = {
        'query': {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [
                            {
                                "terms": {
                                    "uid": uid_list
                                }
                            },
                        ]
                    }
                }
            }
        },
        'size': MAX_SEARCH_SIZE,
        "fields": ["location", "gender", "name", "uid"]
    }
    search_results = es.search(index=facebook_user_index_name,
                               doc_type=facebook_user_index_type,
                               body=fb_user_query_body)['hits']['hits']
    for item in search_results:
        # With "fields" in the query every value arrives as a list.
        content = item['fields']
        uid = content['uid'][0]
        record = _default_baseinfo(uid)
        if 'location' in content:
            location_dict = json.loads(content['location'][0])
            record['location'] = get_user_location(location_dict)
        if 'gender' in content:
            gender_str = content['gender'][0]
            if gender_str == 'male':
                record['gender'] = 1
            elif gender_str == 'female':
                record['gender'] = 2
        if 'name' in content:
            record['uname'] = content['name'][0]
        # A later hit for the same uid replaces the earlier record, as before.
        user_baseinfo[uid] = record
    # Make sure every requested uid is represented, even without a hit.
    for uid in uid_list:
        if uid not in user_baseinfo:
            user_baseinfo[uid] = _default_baseinfo(uid)
    return save_data2es(user_baseinfo)
Exemplo n.º 5
0
def load_uid_list():
    """Return the "uid" of up to MAX_SEARCH_SIZE documents in the user index.

    Returns:
        A list of the "uid" values read from each hit's "_source". If the
        search or the extraction raises, the error is printed and whatever
        was collected up to that point (possibly an empty list) is returned.
    """
    uid_list = []
    uid_list_query_body = {'size': MAX_SEARCH_SIZE}
    try:
        search_results = es.search(
            index=facebook_user_index_name,
            doc_type=facebook_user_index_type,
            body=uid_list_query_body)['hits']['hits']
        for item in search_results:
            uid_list.append(item['_source']['uid'])
    except Exception as e:
        # Best effort: report the failure and fall through to the return.
        print(e)
    # Callers assign the result (uid_list = load_uid_list()), so the list
    # has to be handed back; previously the function returned None.
    return uid_list
Exemplo n.º 6
0
def update_sentiment(uid_list=None):
    """Gather per-user sentiment codes from the Facebook flow-text indices.

    Sentiment codes, translated from the original note on SENTIMENT_DICT_NEW:
        '0' neutral, '1' positive, '2' angry, '3' anxious,
        '4' sad, '5' disgusted, '6' other negative, '7' negative

    Args:
        uid_list: uids to restrict the search to. When empty or omitted, the
            result of load_uid_list() is used instead.

    For every index name produced by
    get_facebook_flow_text_index_list(load_timestamp()), documents whose
    "uid" is in uid_list are fetched (at most MAX_SEARCH_SIZE per index,
    sorted by timestamp descending). Each hit's "sentiment" value is
    converted to int and appended to that user's "sentiment_list".

    An error raised while handling one index is printed and the loop moves
    on to the next index.

    NOTE(review): user_sentiment is built but neither returned nor stored by
    the code visible here; confirm what callers expect.
    """
    # The default is None rather than a shared mutable list; an empty value
    # still falls back to every known uid, exactly as before.
    if not uid_list:
        uid_list = load_uid_list()
    fb_flow_text_index_list = get_facebook_flow_text_index_list(
        load_timestamp())
    sentiment_query_body = {
        'query': {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [
                            {
                                "terms": {
                                    "uid": uid_list
                                }
                            },
                        ]
                    }
                }
            }
        },
        'size': MAX_SEARCH_SIZE,
        "sort": {
            "timestamp": {
                "order": "desc"
            }
        },
        "fields": ["sentiment", "uid"]
    }
    user_sentiment = {}
    for index_name in fb_flow_text_index_list:
        try:
            search_results = es.search(
                index=index_name,
                doc_type=facebook_flow_text_index_type,
                body=sentiment_query_body)['hits']['hits']
            for item in search_results:
                # With "fields" in the query every value arrives as a list.
                content = item['fields']
                uid = content['uid'][0]
                user_entry = user_sentiment.setdefault(
                    uid, {'sentiment_list': []})
                if 'sentiment' in content:
                    user_entry['sentiment_list'].append(
                        int(content['sentiment'][0]))
        except Exception as e:
            # Best effort: report the failure and continue with the next index.
            print(e)
Exemplo n.º 7
0
def load_fb_flow_text(fb_flow_text_index_list,
                      uid_list,
                      fb_flow_text_query_body=None):
    """Collect the post texts of the given users from the flow-text indices.

    Args:
        fb_flow_text_index_list: index names to search, one request each.
        uid_list: uids used in the default query's "terms" filter.
        fb_flow_text_query_body: optional search body. When empty or omitted
            a default is built that selects documents whose "uid" is in
            uid_list, sorted by timestamp descending, limited to
            MAX_SEARCH_SIZE, returning the "text" and "uid" fields.

    Returns:
        A dict mapping uid -> {'text_dict': {document id: text}}. Each text
        is cut to its first 1800 characters and passed through
        traditional2simplified(); a hit without a "text" field maps to ''.
        (The original code built this mapping but returned nothing.)

    An error raised while handling one index is printed and the loop moves
    on to the next index.
    """
    # None replaces the former mutable {} default; an empty body still
    # triggers the default query, exactly as before.
    if not fb_flow_text_query_body:
        fb_flow_text_query_body = {
            'query': {
                "filtered": {
                    "filter": {
                        "bool": {
                            "must": [
                                {
                                    "terms": {
                                        "uid": uid_list
                                    }
                                },
                            ]
                        }
                    }
                }
            },
            'size': MAX_SEARCH_SIZE,
            "sort": {
                "timestamp": {
                    "order": "desc"
                }
            },
            "fields": ["text", "uid"]
        }
    fb_flow_text = {}
    for index_name in fb_flow_text_index_list:
        try:
            search_results = es.search(
                index=index_name,
                doc_type=facebook_flow_text_index_type,
                body=fb_flow_text_query_body)['hits']['hits']
            for item in search_results:
                # With "fields" in the query every value arrives as a list.
                content = item['fields']
                uid = content['uid'][0]
                text_dict = fb_flow_text.setdefault(
                    uid, {'text_dict': {}})['text_dict']
                if 'text' in content:
                    # Cap the text at 1800 characters to avoid trouble when
                    # it is translated later on.
                    text_dict[item['_id']] = traditional2simplified(
                        content['text'][0][:1800])
                else:
                    text_dict[item['_id']] = ''
        except Exception as e:
            # Best effort: report the failure and continue with the next index.
            print(e)
    return fb_flow_text
Exemplo n.º 8
0
def update_hashtag(uid_list=None):
    """Gather per-user hashtag lists from the Facebook flow-text indices.

    Args:
        uid_list: uids to restrict the search to. When empty or omitted, the
            result of load_uid_list() is used instead.

    For every index name produced by
    get_facebook_flow_text_index_list(load_timestamp()), documents whose
    "uid" is in uid_list are fetched (at most MAX_SEARCH_SIZE per index,
    sorted by timestamp descending). A non-empty "hashtag" value is split on
    '&' and the pieces are appended to that user's "hashtag_list".

    An error raised while handling one index is printed and the loop moves
    on to the next index.

    NOTE(review): user_hashtag is built but neither returned nor stored by
    the code visible here; confirm what callers expect.
    """
    # The default is None rather than a shared mutable list; an empty value
    # still falls back to every known uid, exactly as before.
    if not uid_list:
        uid_list = load_uid_list()
    fb_flow_text_index_list = get_facebook_flow_text_index_list(
        load_timestamp())
    hashtag_query_body = {
        'query': {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [
                            {
                                "terms": {
                                    "uid": uid_list
                                }
                            },
                        ]
                    }
                }
            }
        },
        'size': MAX_SEARCH_SIZE,
        "sort": {
            "timestamp": {
                "order": "desc"
            }
        },
        "fields": ["hashtag", "uid"]
    }
    user_hashtag = {}
    for index_name in fb_flow_text_index_list:
        try:
            search_results = es.search(
                index=index_name,
                doc_type=facebook_flow_text_index_type,
                body=hashtag_query_body)['hits']['hits']
            for item in search_results:
                # With "fields" in the query every value arrives as a list.
                content = item['fields']
                uid = content['uid'][0]
                user_entry = user_hashtag.setdefault(
                    uid, {'hashtag_list': []})
                if 'hashtag' in content:
                    hashtag = content['hashtag'][0]
                    if hashtag:
                        # Several tags are stored in one '&'-joined string.
                        user_entry['hashtag_list'].extend(hashtag.split('&'))
        except Exception as e:
            # Best effort: report the failure and continue with the next index.
            print(e)
Exemplo n.º 9
0
def _first_field_value(content, field, max_len=None):
    """Return the first value of content[field], or '' when it is absent.

    When max_len is given the value is cut to its first max_len characters.
    """
    if field not in content:
        return ''
    value = content[field][0]
    if max_len is None:
        return value
    return value[:max_len]


def update_domain(uid_list=None):
    """Gather the profile texts used for domain classification of each user.

    Args:
        uid_list: uids to restrict the search to. When empty or omitted, the
            result of load_uid_list() is used instead.

    The Facebook user index is queried for the "bio", "about", "description",
    "quotes" and "category" fields of the given uids. For every hit,
    user_domain_data[uid] holds:
        'bio_str'        - '' (not filled in by the code visible here)
        'bio_list'       - [quotes, bio, about, description], each cut to
                           1000 characters
        'category'       - the untruncated "category" value, or ''
        'number_of_text' - count_result[uid], taken from count_text_num()

    Any error raised by the search or while processing a hit is printed and
    stops the processing of the remaining hits.

    NOTE(review): user_domain_data is built but neither returned nor stored
    by the code visible here; confirm what callers expect.
    """
    # The default is None rather than a shared mutable list; an empty value
    # still falls back to every known uid, exactly as before.
    if not uid_list:
        uid_list = load_uid_list()
    fb_flow_text_index_list = get_facebook_flow_text_index_list(load_timestamp())
    user_domain_data = {}
    # load num of text
    count_result = count_text_num(uid_list, fb_flow_text_index_list)
    # load baseinfo
    fb_user_query_body = {
        'query': {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [
                            {"terms": {"uid": uid_list}},
                        ]
                    }
                }
            }
        },
        'size': MAX_SEARCH_SIZE,
        "fields": ["bio", "about", "description", "quotes", "category", "uid"]
    }
    try:
        search_results = es.search(
            index=facebook_user_index_name,
            doc_type=facebook_user_index_type,
            body=fb_user_query_body)['hits']['hits']
        for item in search_results:
            # With "fields" in the query every value arrives as a list.
            content = item['fields']
            uid = content['uid'][0]
            if uid not in user_domain_data:
                text_num = count_result[uid]
                user_domain_data[uid] = {
                    'bio_str': '',
                    'bio_list': [],
                    'category': '',
                    'number_of_text': text_num
                }
            # Background (translated from the original note): for long text,
            # Goslate splits the input at punctuation/line breaks into pieces
            # of close to 2000 bytes, queries them one by one and joins the
            # translated results, which is how it gets around the length
            # limit.
            category = _first_field_value(content, 'category')
            # Some profile texts run past 3000 characters; the excess is of
            # no use and causes problems during translation, so only the
            # first 1000 characters are kept.
            description = _first_field_value(content, 'description', 1000)
            quotes = _first_field_value(content, 'quotes', 1000)
            bio = _first_field_value(content, 'bio', 1000)
            about = _first_field_value(content, 'about', 1000)
            user_domain_data[uid]['bio_list'] = [quotes, bio, about, description]
            user_domain_data[uid]['category'] = category
    except Exception as e:
        # Best effort: report the failure instead of propagating it.
        print(e)