Example #1
def update_topic(uid_list=[]):
    if not uid_list:
        uid_list = load_uid_list()
    fb_flow_text_index_list = get_facebook_flow_text_index_list(load_timestamp())
    user_topic_data = get_filter_keywords(fb_flow_text_index_list, uid_list)
    user_topic_dict, user_topic_list = topic_classfiy(uid_list, user_topic_data)
    
    user_topic_string = {}
    for uid, topic_list in user_topic_list.items():
        li = []
        for t in topic_list:
            li.append(zh_data[name_list.index(t)].decode('utf8'))
        user_topic_string[uid] = '&'.join(li)
    user_topic = {}
    for uid in uid_list:
        if uid in user_topic_dict:
            user_topic[uid] = {
                'filter_keywords': json.dumps(user_topic_data[uid]),
                'topic': json.dumps(user_topic_dict[uid]),
                'topic_string': user_topic_string[uid]
            }
        else:
            user_topic[uid] = {
                'filter_keywords': json.dumps({}),
                'topic': json.dumps({}),
                'topic_string': ''
            }
    return save_data2es(user_topic)
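topic_classfiy, name_list and zh_data are defined elsewhere in the module; from the way they are used above, user_topic_list maps each uid to a list of topic names, and zh_data holds the Chinese label for each entry of name_list (stored as UTF-8 byte strings, hence the .decode('utf8')). A minimal sketch of the label-join step, using hypothetical stand-in data:

# Hypothetical stand-ins for the module-level name_list / zh_data pair,
# assumed to be parallel lists of English topic names and Chinese labels.
name_list = ['politics', 'sports', 'technology']
zh_data = [u'政治', u'体育', u'科技']

user_topic_list = {'uid_001': ['politics', 'technology']}  # uid -> topic names

user_topic_string = {}
for uid, topic_list in user_topic_list.items():
    # look up each topic's Chinese label by position and join with '&',
    # mirroring the loop in update_topic above
    labels = [zh_data[name_list.index(t)] for t in topic_list]
    user_topic_string[uid] = u'&'.join(labels)

print(user_topic_string['uid_001'])  # 政治&科技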
Example #2
def update_keywords(uid_list=[]):
    if not uid_list:
        uid_list = load_uid_list()
    fb_flow_text_index_list = get_facebook_flow_text_index_list(load_timestamp())
    keywords_query_body = {
        'query':{
            "filtered":{
                "filter": {
                    "bool": {
                        "must": [
                            {"terms": {"uid": uid_list}},
                        ]
                    }
                }
            }
        },
        'size': MAX_SEARCH_SIZE,
        "sort": {"timestamp": {"order": "desc"}},
        "fields": ["keywords_dict", "uid"]
    }
    user_keywords = {}
    for index_name in fb_flow_text_index_list:
        try:
            search_results = es.search(index=index_name, doc_type=facebook_flow_text_index_type, body=keywords_query_body)['hits']['hits']
            for item in search_results:
                content = item['fields']
                uid = content['uid'][0]
                if not uid in user_keywords:
                    user_keywords[uid] = {
                        'keywords': {}
                    }
                if content.has_key('keywords_dict'):
                    user_keywords[uid]['keywords'] = merge_dict(user_keywords[uid]['keywords'], json.loads(content['keywords_dict'][0]))
        except Exception,e:
            print e
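merge_dict is a project helper that is not shown in these snippets; judging from how it is called above, it folds each post's keywords_dict (keyword -> count) into the user's running total. A minimal sketch under that assumption (the real helper may behave differently):

def merge_dict(total_dict, new_dict):
    # Hypothetical reimplementation: add the counts of keywords present in
    # both dicts and copy over any keyword seen for the first time.
    merged = dict(total_dict)
    for keyword, count in new_dict.items():
        merged[keyword] = merged.get(keyword, 0) + count
    return merged

print(merge_dict({'election': 3, 'vote': 1}, {'vote': 2, 'poll': 1}))
# {'election': 3, 'vote': 3, 'poll': 1}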
Example #3
def update_domain(uid_list=[]):
    if not uid_list:
        uid_list = load_uid_list()
    fb_flow_text_index_list = get_facebook_flow_text_index_list(
        load_timestamp(), TEST_MAX_FLOW_TEXT_DAYS)
    user_domain_data = {}
    #load num of text
    count_result = count_text_num(uid_list, fb_flow_text_index_list)
    #load baseinfo
    fb_user_query_body = {
        'post_filter': {
            'exists': {
                'field': 'bio_str'
            }
        },
        'query': {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [
                            {
                                "terms": {
                                    "uid": uid_list
                                }
                            },
                        ]
                    }
                }
            }
        },
        'size': MAX_SEARCH_SIZE,
        "fields": ["bio_str", "category", "uid"]
    }
    try:
        search_results = es.search(index=facebook_user_index_name,
                                   doc_type=facebook_user_index_type,
                                   body=fb_user_query_body)['hits']['hits']
        for item in search_results:
            content = item['fields']
            uid = content['uid'][0]
            if not uid in user_domain_data:
                text_num = count_result[uid]
                user_domain_data[uid] = {
                    'bio_str': '',
                    'category': '',
                    'number_of_text': text_num
                }
            # For long text, Goslate splits the input at punctuation/newline boundaries into
            # sub-texts of roughly 2000 bytes each, queries them one by one, and concatenates
            # the translated results before returning them. This is how Goslate works around
            # the length limit on a single request.
            if content.has_key('category'):
                category = content.get('category')[0]
            else:
                category = ''
            if content.has_key('bio_str'):
                bio_str = content.get('bio_str')[0]
            else:
                bio_str = ''
            user_domain_data[uid]['bio_str'] = bio_str
            user_domain_data[uid]['category'] = category
    except Exception, e:
        print e
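The comment above refers to Goslate, a Google Translate client: it chunks long input into roughly 2000-byte pieces internally, so the whole bio string can be passed in a single call. A minimal usage sketch, assuming the goslate package is installed and its endpoint is still reachable (the project's actual translation step is not shown in these examples):

import goslate

gs = goslate.Goslate()
# Goslate chunks long text internally (~2000-byte pieces split at punctuation
# or newlines), translates each piece, and concatenates the results.
bio_str = u'这里是一段可能很长的用户简介'  # placeholder profile text
print(gs.translate(bio_str, 'en'))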
Example #4
def update_sensitive(uid_list=[]):
    if not uid_list:
        uid_list = load_uid_list()
    fb_flow_text_index_list = get_facebook_flow_text_index_list(
        load_timestamp(), TEST_MAX_FLOW_TEXT_DAYS)
    sensitive_query_body = {
        'query': {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [
                            {
                                "terms": {
                                    "uid": uid_list
                                }
                            },
                        ]
                    }
                }
            }
        },
        'size': MAX_SEARCH_SIZE,
        "sort": {
            "timestamp": {
                "order": "desc"
            }
        },
        "fields": ["sensitive_words_dict", "sensitive", "uid"]
    }
    user_sensitive = {}
    for index_name in fb_flow_text_index_list:
        try:
            search_results = es.search(
                index=index_name,
                doc_type=facebook_flow_text_index_type,
                body=sensitive_query_body)['hits']['hits']
            for item in search_results:
                content = item['fields']
                uid = content['uid'][0]
                if not uid in user_sensitive:
                    user_sensitive[uid] = {
                        'sensitive_dict': {},
                        'sensitive_list': []
                    }
                if content.has_key('sensitive_words_dict'):
                    user_sensitive[uid]['sensitive_dict'] = merge_dict(
                        user_sensitive[uid]['sensitive_dict'],
                        json.loads(content['sensitive_words_dict'][0]))
                if content.has_key('sensitive'):
                    user_sensitive[uid]['sensitive_list'].append(
                        float(content.get('sensitive')[0]))
        except Exception, e:
            print e
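This example ends before the collected data is reduced or written anywhere. Purely as an illustration of what could follow, a hypothetical aggregation that turns each user's sensitive_list and sensitive_dict into a single score and a short ranked word list (the real pipeline may compute something else):

def summarize_sensitive(user_sensitive):
    # Hypothetical aggregation, not part of the original example.
    summary = {}
    for uid, data in user_sensitive.items():
        top_words = sorted(data['sensitive_dict'].items(),
                           key=lambda kv: kv[1], reverse=True)[:5]
        summary[uid] = {
            'sensitive_score': sum(data['sensitive_list']),
            'top_sensitive_words': top_words
        }
    return summary

print(summarize_sensitive({
    'uid_001': {'sensitive_dict': {'word_a': 4, 'word_b': 1},
                'sensitive_list': [0.8, 1.2]}
}))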
Example #5
def update_sentiment(uid_list=[]):
    '''
    SENTIMENT_DICT_NEW = {'0': u'中性', '1': u'积极', '2': u'生气', '3': u'焦虑', \
         '4': u'悲伤', '5': u'厌恶', '6': u'消极其他', '7': u'消极'}
    (0: neutral, 1: positive, 2: angry, 3: anxious,
     4: sad, 5: disgusted, 6: other negative, 7: negative)
    '''
    if not uid_list:
        uid_list = load_uid_list()
    fb_flow_text_index_list = get_facebook_flow_text_index_list(
        load_timestamp())
    sentiment_query_body = {
        'query': {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [
                            {
                                "terms": {
                                    "uid": uid_list
                                }
                            },
                        ]
                    }
                }
            }
        },
        'size': MAX_SEARCH_SIZE,
        "sort": {
            "timestamp": {
                "order": "desc"
            }
        },
        "fields": ["sentiment", "uid"]
    }
    user_sentiment = {}
    for index_name in fb_flow_text_index_list:
        try:
            search_results = es.search(
                index=index_name,
                doc_type=facebook_flow_text_index_type,
                body=sentiment_query_body)['hits']['hits']
            for item in search_results:
                content = item['fields']
                uid = content['uid'][0]
                if not uid in user_sentiment:
                    user_sentiment[uid] = {'sentiment_list': []}
                if content.has_key('sentiment'):
                    user_sentiment[uid]['sentiment_list'].append(
                        int(content.get('sentiment')[0]))
        except Exception, e:
            print e
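The sentiment field holds the numeric codes listed in the docstring above, and this example also stops before any aggregation. A small illustrative reduction of a user's sentiment_list into a code distribution (not part of the original):

from collections import Counter

def sentiment_distribution(sentiment_list):
    # Count how often each code appears: 0 neutral, 1 positive, 2 angry,
    # 3 anxious, 4 sad, 5 disgusted, 6 other negative, 7 negative.
    return dict(Counter(sentiment_list))

print(sentiment_distribution([0, 1, 1, 4, 7, 1]))  # e.g. {0: 1, 1: 3, 4: 1, 7: 1}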
Example #6
def update_hashtag(uid_list=[]):
    if not uid_list:
        uid_list = load_uid_list()
    fb_flow_text_index_list = get_facebook_flow_text_index_list(
        load_timestamp())
    keywords_query_body = {
        'query': {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [
                            {
                                "terms": {
                                    "uid": uid_list
                                }
                            },
                        ]
                    }
                }
            }
        },
        'size': MAX_SEARCH_SIZE,
        "sort": {
            "timestamp": {
                "order": "desc"
            }
        },
        "fields": ["hashtag", "uid"]
    }
    user_hashtag = {}
    for index_name in fb_flow_text_index_list:
        try:
            search_results = es.search(
                index=index_name,
                doc_type=facebook_flow_text_index_type,
                body=keywords_query_body)['hits']['hits']
            for item in search_results:
                content = item['fields']
                uid = content['uid'][0]
                if not uid in user_hashtag:
                    user_hashtag[uid] = {'hashtag_list': []}
                if content.has_key('hashtag'):
                    hashtag = content['hashtag'][0]
                    if hashtag:
                        hashtag_list = hashtag.split('&')
                        user_hashtag[uid]['hashtag_list'].extend(hashtag_list)
        except Exception, e:
            print e
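hashtag is stored in the flow-text index as a single '&'-joined string, which is why the code splits on '&' before extending hashtag_list. A short follow-up, for illustration only, that ranks the hashtags collected for a user:

from collections import Counter

hashtag_list = ['news', 'election', 'news', 'sports']  # as built by update_hashtag
print(Counter(hashtag_list).most_common(2))  # e.g. [('news', 2), ('election', 1)]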
Example #7
def update_domain(uid_list=[]):
    if not uid_list:
        uid_list = load_uid_list()
    fb_flow_text_index_list = get_facebook_flow_text_index_list(load_timestamp())
    user_domain_data = {}
    #load num of text
    count_result = count_text_num(uid_list, fb_flow_text_index_list)
    #load baseinfo
    fb_user_query_body = {
        'query':{
            "filtered":{
                "filter": {
                    "bool": {
                        "must": [
                            {"terms": {"uid": uid_list}},
                        ]
                    }
                }
            }
        },
        'size': MAX_SEARCH_SIZE,
        "fields": ["bio", "about", "description", "quotes", "category", "uid"]
    }
    try:
        search_results = es.search(index=facebook_user_index_name, doc_type=facebook_user_index_type, body=fb_user_query_body)['hits']['hits']
        for item in search_results:
            content = item['fields']
            uid = content['uid'][0]
            if not uid in user_domain_data:
                text_num = count_result[uid]
                user_domain_data[uid] = {
                    'bio_str': '',
                    'bio_list': [],
                    'category': '',
                    'number_of_text': text_num
                }
            # For long text, Goslate splits the input at punctuation/newline boundaries into
            # sub-texts of roughly 2000 bytes each, queries them one by one, and concatenates
            # the translated results before returning them. This is how Goslate works around
            # the length limit on a single request.
            if content.has_key('category'):
                category = content.get('category')[0]
            else:
                category = ''
            if content.has_key('description'):
                description = content.get('description')[0][:1000]  # some users' descriptions run to 3000+ characters, which adds nothing useful and causes problems when translating, so keep only the first part
            else:
                description = ''
            if content.has_key('quotes'):
                quotes = content.get('quotes')[0][:1000]
            else:
                quotes = ''
            if content.has_key('bio'):
                bio = content.get('bio')[0][:1000]
            else:
                bio = ''
            if content.has_key('about'):
                about = content.get('about')[0][:1000]
            else:
                about = ''    
            user_domain_data[uid]['bio_list'] = [quotes, bio, about, description]
            user_domain_data[uid]['category'] = category
    except Exception,e:
        print e
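This variant of update_domain keeps the raw profile fields as bio_list, whereas Example #3 reads an already-combined bio_str field, so the pieces are presumably joined (and, per the comments, translated) into one string later on. A hedged sketch of that joining step, under that assumption:

def build_bio_str(bio_list):
    # Hypothetical helper: drop empty fields and join quotes / bio / about /
    # description into one block of text. The real pipeline may differ.
    return '\n'.join(piece for piece in bio_list if piece)

print(build_bio_str(['', 'Lives in Berlin', '', 'Photographer and traveller']))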