def get_influence(uid):
    """Return the ``user_index`` value stored for *uid* in the ``bci`` index.

    The document is fetched with ``es.get`` from the index whose name is the
    date with its dashes removed (``'2013-09-07'`` -> ``'20130907'``), using
    doc type ``bci``.

    Args:
        uid: Id of the document to look up.

    Returns:
        The ``user_index`` field of the document's ``_source``, or ``0`` if
        the lookup or the field access raises.
    """
    # NOTE(review): the date is pinned to a fixed test day.  The value it
    # replaces was ts2datetime(time.time() - 3600*24), i.e. derived from a
    # timestamp 24 hours in the past; restore that expression once the test
    # data is no longer needed.
    now_date = '2013-09-07'
    index_time = now_date.replace('-', '')
    index_type = 'bci'
    try:
        hit = es.get(index=index_time, id=uid, doc_type=index_type)
        return hit['_source']['user_index']
    except Exception:
        # Best-effort lookup: any failure is reported as zero influence.
        # Unlike a bare ``except:``, this still lets KeyboardInterrupt and
        # SystemExit propagate.
        return 0
def get_influence(uid):
    """Return the ``user_index`` value stored for *uid* in the ``bci`` index.

    The index queried is named after the date string that ``ts2datetime``
    produces for a timestamp 24 hours before now, with the dashes removed.
    The document is fetched with ``es.get`` using doc type ``bci``.

    Args:
        uid: Id of the document to look up.

    Returns:
        The ``user_index`` field of the document's ``_source``, or ``0`` if
        the lookup or the field access raises.
    """
    one_day = 3600 * 24
    now_date = ts2datetime(time.time() - one_day)
    index_time = now_date.replace('-', '')
    index_type = 'bci'
    try:
        hit = es.get(index=index_time, id=uid, doc_type=index_type)
        return hit['_source']['user_index']
    except Exception:
        # Best-effort lookup: any failure is reported as zero influence.
        # Unlike a bare ``except:``, this still lets KeyboardInterrupt and
        # SystemExit propagate.
        return 0
def get_importance(uid, domain, topic):
    """Compute a weighted importance score for the user *uid*.

    The score is a linear combination, with coefficients taken from
    ``importance_weight_dict``, of four terms:

    * ``user_fansnum`` from the user's ``bci`` document (key ``'fansnum'``),
    * ``origin_weibo_retweeted_total_number`` plus
      ``retweeted_weibo_retweeted_total_number`` from the same document
      (key ``'retweeted_num'``),
    * the summed ``domain_weight_dict`` weights of the labels in *domain*
      (key ``'domain'``),
    * the summed ``topic_weight_dict`` weights of the labels in *topic*
      (key ``'topic'``).

    Args:
        uid: Id of the ``bci`` document to read the counters from.
        domain: Space-separated domain labels; labels without an entry in
            ``domain_weight_dict`` contribute nothing.
        topic: Space-separated topic labels; labels without an entry in
            ``topic_weight_dict`` contribute nothing.

    Returns:
        The computed score, or ``0`` if fetching the document or computing
        the score raises.
    """
    # Unknown labels are simply ignored (weight 0); distinct loop names are
    # used so the ``domain``/``topic`` parameters are not rebound.
    domain_result = sum(domain_weight_dict.get(label, 0)
                        for label in domain.split(' '))
    topic_result = sum(topic_weight_dict.get(label, 0)
                       for label in topic.split(' '))

    # NOTE(review): the date is pinned to a fixed test day.  The value it
    # replaces was ts2datetime(time.time() - 3600*24), i.e. derived from a
    # timestamp 24 hours in the past; restore that expression once the test
    # data is no longer needed.
    date = '2013-09-07'
    index_time = date.replace('-', '')
    index_type = 'bci'
    try:
        es_result = es.get(index=index_time, doc_type=index_type, id=uid)['_source']
        fansnum = es_result['user_fansnum']
        retweetednum = (es_result['origin_weibo_retweeted_total_number'] +
                        es_result['retweeted_weibo_retweeted_total_number'])
        return (importance_weight_dict['fansnum'] * fansnum +
                importance_weight_dict['retweeted_num'] * retweetednum +
                importance_weight_dict['domain'] * domain_result +
                importance_weight_dict['topic'] * topic_result)
    except Exception:
        # Best-effort: a failed lookup or a missing field yields a zero
        # score.  Unlike a bare ``except:``, KeyboardInterrupt and
        # SystemExit still propagate.
        return 0
def get_importance(uid, domain, topic):
    """Compute a weighted importance score for the user *uid*.

    The score is a linear combination, with coefficients taken from
    ``importance_weight_dict``, of four terms:

    * ``user_fansnum`` from the user's ``bci`` document (key ``'fansnum'``),
    * ``origin_weibo_retweeted_total_number`` plus
      ``retweeted_weibo_retweeted_total_number`` from the same document
      (key ``'retweeted_num'``),
    * the summed ``domain_weight_dict`` weights of the labels in *domain*
      (key ``'domain'``),
    * the summed ``topic_weight_dict`` weights of the labels in *topic*
      (key ``'topic'``).

    Args:
        uid: Id of the ``bci`` document to read the counters from.
        domain: Space-separated domain labels; labels without an entry in
            ``domain_weight_dict`` contribute nothing.
        topic: Space-separated topic labels; labels without an entry in
            ``topic_weight_dict`` contribute nothing.

    Returns:
        The computed score, or ``0`` if fetching the document or computing
        the score raises.
    """
    # Unknown labels are simply ignored (weight 0); distinct loop names are
    # used so the ``domain``/``topic`` parameters are not rebound.
    domain_result = sum(domain_weight_dict.get(label, 0)
                        for label in domain.split(' '))
    topic_result = sum(topic_weight_dict.get(label, 0)
                       for label in topic.split(' '))

    # NOTE(review): the date is pinned to a fixed test day.  The value it
    # replaces was ts2datetime(time.time() - 3600*24), i.e. derived from a
    # timestamp 24 hours in the past; restore that expression once the test
    # data is no longer needed.
    date = '2013-09-07'
    index_time = date.replace('-', '')
    index_type = 'bci'
    try:
        es_result = es.get(index=index_time, doc_type=index_type, id=uid)['_source']
        fansnum = es_result['user_fansnum']
        retweetednum = (es_result['origin_weibo_retweeted_total_number'] +
                        es_result['retweeted_weibo_retweeted_total_number'])
        return (importance_weight_dict['fansnum'] * fansnum +
                importance_weight_dict['retweeted_num'] * retweetednum +
                importance_weight_dict['domain'] * domain_result +
                importance_weight_dict['topic'] * topic_result)
    except Exception:
        # Best-effort: a failed lookup or a missing field yields a zero
        # score.  Unlike a bare ``except:``, KeyboardInterrupt and
        # SystemExit still propagate.
        return 0
예제 #5
0
    result = es_cluster.search(index=index_name, doc_type="bci", body=query_body)['hits']['hits']
    sensitive_uid = []
    for item in result:
        sensitive_uid.append(item['_source']['uid'])

    return sensitive_uid


if __name__ == '__main__':
    # Maintenance script: for every uid listed in 'sensitive_uid_list.txt'
    # (one per line), set the ``type`` field of its 'sensitive_user_portrait'
    # document to 1 when the stored ``sensitive_words_string`` is truthy and
    # to 0 otherwise.
    #
    # The uid list was previously generated by writing each uid returned by
    # search_sensitive_weibo('20130904') to that file, one per line.
    with open('sensitive_uid_list.txt', 'rb') as f:
        for line in f:
            uid = line.strip()
            if not uid:
                # Skip blank lines instead of issuing a request with an
                # empty id.
                continue
            try:
                result = es_cluster.get(index='sensitive_user_portrait', doc_type='user', id=uid)['_source']
            except Exception:
                # Report the uid that could not be fetched and keep going.
                # The parenthesised form prints the same text under the
                # Python 2 print statement.
                print(uid)
                continue
            # NOTE(review): documents are read through ``es_cluster`` but
            # updated through ``es`` -- confirm both clients target the same
            # cluster.
            user_type = 1 if result['sensitive_words_string'] else 0
            es.update(index='sensitive_user_portrait', doc_type='user', id=uid, body={"doc": {"type": user_type}})