def compute_attribute(user_weibo_dict):
    """Compute portrait attributes for every user and index them in Elasticsearch.

    Flow information is fetched in batches of 1000 uids and each user is
    processed exactly once, inside the batch whose flow data was fetched.
    Documents are sent to the 'sensitive_user_portrait_0103' index in bulk
    requests of 200 users.

    Args:
        user_weibo_dict: mapping whose keys are uids and whose values are
            passed to compute_text_attribute as that user's weibo list.

    Returns:
        The string "1" once every document has been submitted.
    """
    uid_list = list(user_weibo_dict.keys())
    if not uid_list:
        # Nothing to compute; avoid issuing lookups for an empty uid list.
        return "1"

    # Background (profile) information update. The lookup does not depend on
    # the flow batch, so it is fetched once instead of once per batch.
    register_result = get_profile_information(uid_list)

    bulk_action = []
    count = 0
    for start in range(0, len(uid_list), 1000):
        batch = uid_list[start:start + 1000]
        flow_result = get_flow_information(batch)  # flow data update
        # Only iterate the users of this batch: flow_result was fetched for
        # these uids only, and each user must be indexed a single time.
        for user in batch:
            weibo_list = user_weibo_dict[user]
            results = compute_text_attribute(user, weibo_list)  # text attributes
            results['uid'] = str(user)
            results.update(flow_result[str(user)])
            user_info = {
                'uid': str(user),
                'domain': results['domain'],
                'topic': results['topic'],
                'activity_geo': results['geo_string'],
            }
            results.update(get_evaluate_index(user_info, status='insert'))
            results.update(register_result[user])
            action = {'index': {'_id': str(user)}}
            bulk_action.extend([action, results])
            count += 1
            if count % 200 == 0:
                es.bulk(bulk_action, index='sensitive_user_portrait_0103', doc_type="user", timeout=60)
                bulk_action = []
                print(count)
    # Flush the users left over after the last full group of 200.
    if bulk_action:
        es.bulk(bulk_action, index='sensitive_user_portrait_0103', doc_type="user", timeout=60)
    return "1"
Exemplo n.º 2
0
def compute_attribute(uid_list=None):
    """Read the users' weibo, compute their portrait attributes and index them.

    Args:
        uid_list: uids handed to read_user_weibo. Defaults to an empty list
            (the default is created per call rather than shared).

    Returns:
        The string "1" once every document has been submitted.
    """
    if uid_list is None:
        uid_list = []
    user_weibo_dict = read_user_weibo(uid_list)
    # From here on only the users that read_user_weibo returned are processed.
    uid_list = user_weibo_dict.keys()
    flow_result = get_flow_information(uid_list)
    register_result = get_profile_information(uid_list)
    bulk_action = []
    count = 0
    for user in uid_list:
        weibo_list = user_weibo_dict[user]
        # The user name is taken from the first weibo of the user.
        uname = weibo_list[0]['uname']
        results = compute_text_attribute(user, weibo_list)
        results['uname'] = uname
        results['uid'] = str(user)
        results.update(flow_result[str(user)])
        user_info = {
            'uid': str(user),
            'domain': results['domain'],
            'topic': results['topic'],
            'activity_geo': results['geo_string'],
        }
        results.update(get_evaluate_index(user_info, status='insert'))
        results.update(register_result[user])
        action = {'index': {'_id': str(user)}}
        bulk_action.extend([action, results])
        count += 1
        if count % 200 == 0:
            # NOTE(review): this intermediate flush calls es.bulk directly
            # while the final flush goes through save_user_results; confirm
            # whether both should use the same path.
            es.bulk(bulk_action, index=index_name, doc_type="user", timeout=60)
            bulk_action = []
            print(count)
    # Flush the users left over after the last full group of 200.
    if bulk_action:
        save_user_results(bulk_action)
    return "1"
def save_text2es():
    """Index the weibo rows of ./sensitive_uid_text_2.csv into Elasticsearch.

    Columns are read by position: 0 -> uid, 1 -> text, 2 -> mid,
    3 -> value passed to ip2geo, 4 -> timestamp, 5 -> message_type.
    Each row becomes one document in the 'sensitive_user_text' index, keyed
    by its mid, and documents are sent in bulk requests of 1000 rows.
    """
    count = 0
    bulk_action = []
    # The context manager guarantees the file is closed even if a row fails.
    with open('./sensitive_uid_text_2.csv', 'rb') as csvfile:
        for line in csv.reader(csvfile):
            count += 1
            weibo = dict()
            user = line[0]
            weibo['text'] = line[1].decode('utf-8', 'ignore')
            weibo['mid'] = line[2]
            weibo['geo'] = ip2geo(line[3])
            weibo['timestamp'] = line[4]
            weibo['message_type'] = line[5]
            weibo['uid'] = user
            sentiment = attr_liwc([weibo['text']])
            weibo['sentiment'] = json.dumps(sentiment)
            # Always bind `text` for this row; it is only re-encoded when it
            # is not already a `str`, so a value from a previous row can never
            # leak into the sensitive-word extraction.
            text = weibo['text']
            if not isinstance(text, str):
                text = text.encode('utf-8', 'ignore')
            sw_dict = sensitive_words_extract(text)
            if sw_dict:
                weibo['sensitive_words'] = json.dumps(sw_dict)
                weibo['sensitive'] = 1
            else:
                weibo['sensitive'] = 0
            action = {'index': {'_id': weibo['mid']}}
            bulk_action.extend([action, weibo])
            if count % 1000 == 0:
                es.bulk(bulk_action, index='sensitive_user_text', doc_type='user', timeout=30)
                bulk_action = []
                print(count)
    # Flush the rows left over after the last full group of 1000.
    if bulk_action:
        es.bulk(bulk_action, index='sensitive_user_text', doc_type='user', timeout=30)
def week_update_portrait(user_weibo_dict):  # {uid: [weibo_text]}
    """Send partial updates of the weekly portrait fields for every user.

    The domain, psycho status, topic and politics trend values are currently
    hard-coded placeholders; only the profile information returned by
    get_profile_information is real data.

    Args:
        user_weibo_dict: mapping keyed by uid; only its keys are used here.

    Returns:
        The string "1" once every update has been submitted.
    """
    uid_list = user_weibo_dict.keys()
    register_result = get_profile_information(uid_list)  # background information update
    bulk_action = []
    count = 0
    for user in uid_list:
        result = dict()
        result.update(register_result[user])
        # Placeholder until the real domain computation is wired in. The
        # domain is a single label, so the string form is the label itself
        # (joining a str with "&" would split it into characters).
        domain = "test_domain"
        result["domain"] = domain
        result["domain_string"] = domain
        # Placeholder until the real psycho status computation is wired in.
        psycho_status = {"positive": 0.5, "negetive": 0.2, "neutral": 0.3}
        result["psycho_status_string"] = "&".join(psycho_status.keys())
        result["psycho_status"] = json.dumps(psycho_status)
        # Placeholder until the real topic computation is wired in.
        topic = {"政治": 0.3, "民生": 0.7}
        result["topic"] = json.dumps(topic)
        # Join the keys of the dict itself: result["topic"] is already a
        # JSON string and has no keys().
        result["topic_string"] = "&".join(topic.keys())
        # Placeholder until the real politics trend computation is wired in.
        result["politics_trend"] = "left"
        action = {"update": {"_id": str(user)}}
        bulk_action.extend([action, {"doc": result}])
        # Count processed users so that a bulk request is sent once per 1000
        # users instead of once per user.
        count += 1
        if count % 1000 == 0:
            es.bulk(bulk_action, index="sensitive_user_portrait", doc_type="user", timeout=60)
            bulk_action = []
            print(count)
    # Flush the users left over after the last full group of 1000.
    if bulk_action:
        es.bulk(bulk_action, index="sensitive_user_portrait", doc_type="user", timeout=60)
    return "1"
def save_user_results(bulk_action):
    """Submit prepared bulk actions to Elasticsearch.

    Args:
        bulk_action: list of bulk entries (action metadata followed by the
            document body) as built by the callers.

    Returns:
        True once the request has been issued, or immediately when there is
        nothing to send.
    """
    # An empty bulk body is not a valid request, so skip the round trip.
    if not bulk_action:
        return True
    # NOTE: a per-user Redis status update ('identify_in_sensitive_<date>' /
    # 'identify_in_influence_<date>', keyed on yesterday's date) used to
    # follow the bulk call but was disabled; it is intentionally not executed.
    es.bulk(bulk_action, index=index_name, doc_type=index_type)
    return True
Exemplo n.º 6
0
def save_user_results(bulk_action):
    """Write the given bulk entries to the configured Elasticsearch index.

    Args:
        bulk_action: list of bulk entries (action metadata followed by the
            document body) as built by the callers.

    Returns:
        True once the request has been issued, or immediately when the list
        is empty.
    """
    if not bulk_action:
        # Nothing to index; an empty bulk body is not a valid request.
        return True
    es.bulk(bulk_action, index=index_name, doc_type=index_type)
    # NOTE: the Redis bookkeeping that marked each user with status '3' under
    # 'identify_in_sensitive_<date>' / 'identify_in_influence_<date>' was
    # disabled in the original code and is deliberately left out.
    return True
Exemplo n.º 7
0
def update_portrait():
    """Push the flow information of every known user as a partial update.

    The users come from read_user_weibo(); their flow information is fetched
    in one call and sent to the 'sensitive_user_portrait' index in bulk
    requests of 500 users.

    Returns:
        The string '1' once every update has been submitted.
    """
    weibo_by_uid = read_user_weibo()
    uids = weibo_by_uid.keys()
    flow_by_uid = get_flow_information(uids)
    pending = []
    for processed, uid in enumerate(uids, 1):
        header = {'update': {'_id': str(uid)}}
        body = {'doc': flow_by_uid[uid]}
        pending.append(header)
        pending.append(body)
        if processed % 500:
            # Batch is not full yet.
            continue
        es.bulk(pending, index='sensitive_user_portrait', doc_type='user', timeout=60)
        pending = []
        print(processed)
    # Send whatever remains after the last full batch.
    if pending:
        es.bulk(pending, index='sensitive_user_portrait', doc_type='user', timeout=60)
    return '1'
def week_update_portrait(user_weibo_dict):  # {uid: [weibo_text]}
    """Send partial updates of the weekly portrait fields for every user.

    The domain, psycho status, topic and politics trend values are currently
    hard-coded placeholders; only the profile information returned by
    get_profile_information is real data.

    Args:
        user_weibo_dict: mapping keyed by uid; only its keys are used here.

    Returns:
        The string '1' once every update has been submitted.
    """
    uid_list = user_weibo_dict.keys()
    register_result = get_profile_information(uid_list)  # background information update
    bulk_action = []
    count = 0
    for user in uid_list:
        result = dict()
        result.update(register_result[user])
        # Placeholder until the real domain computation is wired in. The
        # domain is a single label, so the string form is the label itself
        # (joining a str with '&' would split it into characters).
        domain = 'test_domain'
        result['domain'] = domain
        result['domain_string'] = domain
        # Placeholder until the real psycho status computation is wired in.
        psycho_status = {'positive': 0.5, 'negetive': 0.2, 'neutral': 0.3}
        result['psycho_status_string'] = '&'.join(psycho_status.keys())
        result['psycho_status'] = json.dumps(psycho_status)
        # Placeholder until the real topic computation is wired in.
        topic = {'政治': 0.3, '民生': 0.7}
        result['topic'] = json.dumps(topic)
        # Join the keys of the dict itself: result['topic'] is already a
        # JSON string and has no keys().
        result['topic_string'] = '&'.join(topic.keys())
        # Placeholder until the real politics trend computation is wired in.
        result['politics_trend'] = 'left'
        action = {'update': {'_id': str(user)}}
        bulk_action.extend([action, {'doc': result}])
        # Count processed users so that a bulk request is sent once per 1000
        # users instead of once per user.
        count += 1
        if count % 1000 == 0:
            es.bulk(bulk_action,
                    index='sensitive_user_portrait',
                    doc_type='user',
                    timeout=60)
            bulk_action = []
            print(count)
    # Flush the users left over after the last full group of 1000.
    if bulk_action:
        es.bulk(bulk_action,
                index='sensitive_user_portrait',
                doc_type='user',
                timeout=60)
    return '1'
def save_text2es():
    """Index the weibo rows of ./sensitive_uid_text_2.csv into Elasticsearch.

    Columns are read by position: 0 -> uid, 1 -> text, 2 -> mid,
    3 -> value passed to ip2geo, 4 -> timestamp, 5 -> message_type.
    Each row becomes one document in the 'sensitive_user_text' index, keyed
    by its mid, and documents are sent in bulk requests of 1000 rows.
    """
    count = 0
    bulk_action = []
    # The context manager guarantees the file is closed even if a row fails.
    with open('./sensitive_uid_text_2.csv', 'rb') as csvfile:
        for line in csv.reader(csvfile):
            count += 1
            weibo = dict()
            user = line[0]
            weibo['text'] = line[1].decode('utf-8', 'ignore')
            weibo['mid'] = line[2]
            weibo['geo'] = ip2geo(line[3])
            weibo['timestamp'] = line[4]
            weibo['message_type'] = line[5]
            weibo['uid'] = user
            sentiment = attr_liwc([weibo['text']])
            weibo['sentiment'] = json.dumps(sentiment)
            # Always bind `text` for this row; it is only re-encoded when it
            # is not already a `str`, so a value from a previous row can never
            # leak into the sensitive-word extraction.
            text = weibo['text']
            if not isinstance(text, str):
                text = text.encode('utf-8', 'ignore')
            sw_dict = sensitive_words_extract(text)
            if sw_dict:
                weibo['sensitive_words'] = json.dumps(sw_dict)
                weibo['sensitive'] = 1
            else:
                weibo['sensitive'] = 0
            action = {'index': {'_id': weibo['mid']}}
            bulk_action.extend([action, weibo])
            if count % 1000 == 0:
                es.bulk(bulk_action,
                        index='sensitive_user_text',
                        doc_type='user',
                        timeout=30)
                bulk_action = []
                print(count)
    # Flush the rows left over after the last full group of 1000.
    if bulk_action:
        es.bulk(bulk_action,
                index='sensitive_user_text',
                doc_type='user',
                timeout=30)
Exemplo n.º 10
0
def daily_update_portrait(user_weibo_dict): # {uid: [weibo_text]}
    """Send the daily partial portrait update of every user to Elasticsearch.

    For each user the flow, text and evaluation results are merged into one
    document and sent as an update to the 'sensitive_user_portrait' index,
    in bulk requests of 1000 users.

    Args:
        user_weibo_dict: mapping keyed by uid whose values are passed to
            temporary_text_update as that user's weibo list.

    Returns:
        The string '1' once every update has been submitted.
    """
    uid_list = user_weibo_dict.keys()
    bulk_action = []
    count = 0
    for user in uid_list:
        results = dict()
        weibo_list = user_weibo_dict[user]
        # NOTE(review): get_flow_information receives a single uid here and
        # its whole return value is merged into the document; confirm that
        # this matches the helper's contract.
        flow_result = get_flow_information(user)
        text_result = temporary_text_update(user, weibo_list)
        evaluate_result = evaluate_index(user, status='update')
        results.update(flow_result)
        results.update(text_result)
        results.update(evaluate_result)
        action = {'update':{'_id': str(user)}}
        result = {'doc':results}
        bulk_action.extend([action, result])
        # Count processed users so that a bulk request is sent once per 1000
        # users instead of once per user.
        count += 1
        if count % 1000 == 0:
            es.bulk(bulk_action, index='sensitive_user_portrait', doc_type='user', timeout=60)
            bulk_action = []
            print(count)
    # Flush the users left over after the last full group of 1000.
    if bulk_action:
        es.bulk(bulk_action, index='sensitive_user_portrait', doc_type='user', timeout=60)
    return '1'
Exemplo n.º 11
0
                # add sentiment field to weibo
                sentiment = get_sentiment_attribute(text)
                item['sentiment'] = sentiment
                # add hashtag field to weibo
                hashtag_string = get_hashtag_attribute(text)
                if hashtag_string != '':
                    item['hashtag'] = hashtag_string
                # save
                action, xdata = expand_index_action(item)
                bulk_action.extend([action, xdata])
                count += 1

        if count % 1 == 0 and count != 0:
            print 'start bulk_action %s' % count
            es.bulk(bulk_action,
                    index=index_name,
                    doc_type=index_type,
                    timeout=60)
            bulk_action = []
            count = 0

        if read_count % 10000 == 0:
            te = time.time()
            print '[%s] cal speed: %s sec/per %s' % (
                datetime.now().strftime('%Y-%m-%d %H:%M:%S'), te - ts, 10000)
            if read_count % 100000 == 0:
                print '[%s] total cal %s, cost %s sec [avg %s per/sec]' % (
                    datetime.now().strftime('%Y-%m-%d %H:%M:%S'), read_count,
                    te - tb, read_count / (te - tb))
            ts = te
                        word = word.decode('utf-8')
                        word_set.add(word)
                    sensitive_word_string = '&'.join(list(word_set))
                    item['sensitive_word'] = sensitive_word_string
                else:
                    item['sensitive'] = 0
                # add sentiment field to weibo
                sentiment = get_sentiment_attribute(text)
                item['sentiment'] = sentiment
                # add hashtag field to weibo
                hashtag_string = get_hashtag_attribute(text)
                if hashtag_string != '':
                    item['hashtag'] = hashtag_string
                # save
                action, xdata = expand_index_action(item)
                bulk_action.extend([action, xdata])
                count += 1
        
        if count % 1 == 0 and count != 0:
            print 'start bulk_action %s' % count
            es.bulk(bulk_action, index=index_name, doc_type=index_type, timeout=60)
            bulk_action = []
            count = 0
        
        if read_count % 10000 == 0:
            te = time.time()
            print '[%s] cal speed: %s sec/per %s' % (datetime.now().strftime('%Y-%m-%d %H:%M:%S'), te - ts, 10000) 
            if read_count % 100000 == 0:
                print '[%s] total cal %s, cost %s sec [avg %s per/sec]' % (datetime.now().strftime('%Y-%m-%d %H:%M:%S'), read_count, te - tb, read_count / (te - tb)) 
            ts = te