Example #1
def save_data2es(data):
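    # For each uid, check whether a portrait doc already exists: existing ids
    # get a bulk 'update', new ids a bulk 'index'. Returns True on success,
    # False as soon as any lookup or bulk item fails.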
    update_uid_list = []
    create_uid_list = []
    try:
        for uid, d in data.items():
            if es.exists(index=tw_portrait_index_name, doc_type=tw_portrait_index_type, id=uid):
                update_uid_list.append(uid)
            else:
                create_uid_list.append(uid)
        #bulk create
        bulk_create_action = []
        if create_uid_list:
            for uid in create_uid_list:
                create_action = {'index':{'_id': uid}}
                bulk_create_action.extend([create_action, data[uid]])
            result = es.bulk(bulk_create_action, index=tw_portrait_index_name, doc_type=tw_portrait_index_type)
            if result['errors']:
                print result
                return False
        #bulk update
        if update_uid_list:
            bulk_update_action = []
            for uid in update_uid_list:
                update_action = {'update':{'_id': uid}}
                bulk_update_action.extend([update_action, {'doc': data[uid]}])
            result = es.bulk(bulk_update_action, index=tw_portrait_index_name, doc_type=tw_portrait_index_type)
            if result['errors']:
                print result
                return False
    except Exception, e:
        print e
        return False
    return True
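Both branches rely on the flat bulk format, where every action line is immediately followed by its payload line. A minimal sketch of the request body save_data2es builds; the host, index and type names here are placeholders, not the module's real settings:

from elasticsearch import Elasticsearch

es = Elasticsearch(['127.0.0.1:9200'])  # assumed host

# action line first, payload line second; es.bulk pairs them up
bulk_body = [
    {'index': {'_id': 'uid1'}}, {'name': 'new user', 'fansnum': 10},  # full doc for 'index'
    {'update': {'_id': 'uid2'}}, {'doc': {'fansnum': 42}},            # partial doc for 'update'
]
result = es.bulk(bulk_body, index='tw_portrait', doc_type='user')
print result['errors']  # True if any item failed; per-item detail is in result['items']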
Example #2
def my_bulk_func(bulk_action, index_name, doc_type):
    # bulk_action: [action,source_item,action,source_item,...]
    try:
        es.bulk(bulk_action,index=index_name,doc_type=doc_type,timeout=600)
    except Exception, e:  # on failure, retry in smaller batches; drop any pair that fails again
        # print 'my_bulk_func Exception: ', str(e)
        for i in range(len(bulk_action)/2):
            temp_bulk_action = bulk_action[2*i : 2*i+2]
            try:
                es.bulk(temp_bulk_action,index=index_name,doc_type=doc_type,timeout=600)
            except:
                pass
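A sketch of the calling convention, reusing the flat action/payload list from Example #1; the index and type names are placeholders:

data = {'uid1': {'fansnum': 10}, 'uid2': {'fansnum': 42}}  # {uid: source_dict}
bulk_action = []
for uid, doc in data.items():
    bulk_action.extend([{'index': {'_id': uid}}, doc])
my_bulk_func(bulk_action, 'tw_portrait', 'user')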
Example #3
def test(ft_type):
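    # Backfill flow-text docs that are missing 'keywords_string': for each
    # day's index, recompute sentiment, keywords, sensitive words, retweet
    # source and hashtags, then write the fields back via bulk updates.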
    print ft_type
    if ft_type == 'facebook':
        index_name_pre = facebook_flow_text_index_name_pre
        index_type = facebook_flow_text_index_type
        user_index_name = facebook_user_index_name
        user_index_type = facebook_user_index_type
    else:
        index_name_pre = twitter_flow_text_index_name_pre
        index_type = twitter_flow_text_index_type
        user_index_name = twitter_user_index_name
        user_index_type = twitter_user_index_type

    # date_list = load_date_list(True)
    date_list = load_date_list()

    DFA = createWordTree()
    query_body = {
        'post_filter': {
            'missing': {
                'field': 'keywords_string'
            }
        },
        'query': {
            'filtered': {
                'filter': {
                    'bool': {
                        'must': [{
                            'range': {
                                'flag_ch': {
                                    'gte': -1
                                }
                            }
                        }]
                    }
                }
            }
        }
    }
    for date in date_list:
        count = 0
        bulk_action = []
        index_name = index_name_pre + date
        try:
            es_scan_results = scan(es,
                                   query=query_body,
                                   size=1000,
                                   index=index_name,
                                   doc_type=index_type)
            while True:
                try:
                    scan_data = es_scan_results.next()
                    item = scan_data['_source']
                    text = item['text_ch']
                    uid = item['uid']
                    if ft_type == 'facebook':
                        _id = item['fid']
                    else:
                        _id = item['tid']

                    ts = datetime2ts(date)
                    #add sentiment field to the post

                    sentiment, keywords_list = triple_classifier(item)

                    #add keywords to the post
                    keywords_dict, keywords_string = get_weibo_keywords(
                        keywords_list)

                    #sensitive_words_dict
                    sensitive_words_dict = searchWord(
                        text.encode('utf-8', 'ignore'), DFA)
                    if sensitive_words_dict:
                        sensitive_words_string_data = "&".join(
                            sensitive_words_dict.keys())
                        sensitive_words_dict_data = json.dumps(
                            sensitive_words_dict)
                    else:
                        sensitive_words_string_data = ""
                        sensitive_words_dict_data = json.dumps({})

                    #redis
                    if sensitive_words_dict:
                        sensitive_count_string = r_cluster.hget(
                            'sensitive_' + str(ts), str(uid))
                        if sensitive_count_string:  # guard against an empty value from redis
                            sensitive_count_dict = json.loads(
                                sensitive_count_string)
                            for word in sensitive_words_dict.keys():
                                if sensitive_count_dict.has_key(word):
                                    sensitive_count_dict[
                                        word] += sensitive_words_dict[word]
                                else:
                                    sensitive_count_dict[
                                        word] = sensitive_words_dict[word]
                            r_cluster.hset('sensitive_' + str(ts), str(uid),
                                           json.dumps(sensitive_count_dict))
                        else:
                            r_cluster.hset('sensitive_' + str(ts), str(uid),
                                           json.dumps(sensitive_words_dict))

                    #sensitive
                    sensitive_score = 0
                    if sensitive_words_dict:
                        for k, v in sensitive_words_dict.iteritems():
                            tmp_stage = r_sensitive.hget("sensitive_words", k)
                            if tmp_stage:
                                sensitive_score += v * sensitive_score_dict[
                                    str(tmp_stage)]

                    #directed_uid
                    directed_uid_data = 0
                    directed_uid, directed_uname = get_root_retweet(
                        text, uid, ft_type)
                    if directed_uid:
                        directed_uid_data = long(directed_uid)

                    # hashtag
                    hashtag = ''
                    RE = re.compile(
                        u'#([0-9a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+)[ ," =.。: :、]'
                    )
                    hashtag_list = re.findall(RE, text)
                    if hashtag_list:
                        hashtag = '&'.join(hashtag_list)

                    #action
                    action = {'update': {'_id': _id}}

                    # action_data
                    action_data = {
                        'sentiment': str(sentiment),
                        'keywords_dict': json.dumps(keywords_dict),
                        'keywords_string': keywords_string,
                        'sensitive_words_string': sensitive_words_string_data,
                        'sensitive_words_dict': sensitive_words_dict_data,
                        'sensitive': sensitive_score,
                        'directed_uid': directed_uid_data,
                        'directed_uname': directed_uname,
                        'hashtag': hashtag,
                    }

                    bulk_action.extend([action, {'doc': action_data}])
                    count += 1

                    if count % 1000 == 0 and count != 0:
                        if bulk_action:
                            es.bulk(bulk_action,
                                    index=index_name,
                                    doc_type=index_type,
                                    timeout=600)
                        bulk_action = []
                        count = 0
                except StopIteration:
                    break
            if bulk_action:

                es.bulk(bulk_action,
                        index=index_name,
                        doc_type=index_type,
                        timeout=600)
        except Exception, e:  # e.g. the ES index for this date does not exist
            print e
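The while True / .next() / StopIteration pattern above works, but elasticsearch.helpers.scan already returns a plain generator, so the same traversal reads more naturally as a for loop; a minimal sketch with the same query_body:

from elasticsearch.helpers import scan

for hit in scan(es, query=query_body, size=1000,
                index=index_name, doc_type=index_type):
    item = hit['_source']
    # ... compute the fields and extend bulk_action exactly as above ...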
Example #4
def update_chinese_user(index_name_pre, index_type, user_index_name,
                        user_index_type, date_list):
    # chinese_user == 1 in the user index marks a Chinese-language user
    chinese_user_list = load_chinese_user(user_index_name, user_index_type)
    all_user_list = load_chinese_user(user_index_name, user_index_type, {
        'query': {
            'match_all': {},
        },
        'size': 99999,
    })
    unchinese_user_list = list(set(all_user_list) - set(chinese_user_list))

    # user_list needing an update (set chinese_user to 1 or 0)
    update_list = []

    # per-day flow-text (stream) indices
    flow_text_index_list = []
    for date in date_list:
        flow_text_index_list.append(index_name_pre + date)

    # For users not flagged as Chinese: when a uid has at least 20 posts in
    # flow_text and more than 90% of them are Chinese, set chinese_user = 1
    for uid in unchinese_user_list:
        text_num = count_flow_text_num(uid, flow_text_index_list, index_type)
        if text_num >= 20:
            text_num_ch = count_flow_text_num(uid, flow_text_index_list,
                                              index_type, True)
            if float(text_num_ch) / text_num > 0.9:
                update_list.append((uid, 1))

    # For users flagged as Chinese, selectively downgrade to chinese_user = 0
    for uid in chinese_user_list:
        text_num = count_flow_text_num(uid, flow_text_index_list, index_type)
        if text_num:
            text_num += 1  # avoid a zero denominator
            text_num_ch = count_flow_text_num(uid, flow_text_index_list,
                                              index_type, True)
            if float(text_num_ch) / text_num < 0.9:  # posting little is fine, but what is posted must be overwhelmingly Chinese
                update_list.append((uid, 0))

    # apply all the updates with bulk requests
    if update_list:
        bulk_update_action = []
        count = 0
        for uid, flag in update_list:
            update_action = {'update': {'_id': uid}}
            bulk_update_action.extend(
                [update_action, {
                    'doc': {
                        'chinese_user': flag
                    }
                }])
            count += 1
            if count % 1000 == 0:
                es.bulk(bulk_update_action,
                        index=user_index_name,
                        doc_type=user_index_type)
                bulk_update_action = []
        if bulk_update_action:
            es.bulk(bulk_update_action,
                    index=user_index_name,
                    doc_type=user_index_type)
    return True
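count_flow_text_num is not included in this listing; a hypothetical sketch of what it presumably looks like, assuming a per-post uid field and treating flag_ch >= 0 as "Chinese post" (both are assumptions inferred from Example #3, not confirmed by the source):

def count_flow_text_num(uid, flow_text_index_list, index_type, chinese_only=False):
    # Count one uid's posts across the per-day flow-text indices;
    # with chinese_only=True, count only posts flagged as Chinese.
    must = [{'term': {'uid': uid}}]
    if chinese_only:
        must.append({'range': {'flag_ch': {'gte': 0}}})  # assumed "is Chinese" convention
    query_body = {'query': {'filtered': {'filter': {'bool': {'must': must}}}}}
    num = 0
    for index_name in flow_text_index_list:
        try:
            num += es.count(index=index_name, doc_type=index_type,
                            body=query_body)['count']
        except Exception:
            pass  # the index for that date may not exist
    return num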
Example #5
# -*- coding: utf-8 -*-
import redis
import sys
sys.path.append('/home/xnr1/xnr_0429/xnr')
from global_utils import es_xnr_2 as es

r_ali = redis.Redis(host='60.205.190.67', port=6379, db=0)
r = r_ali

redis_task_es = 'redis_task_es'

if r_ali.llen(redis_task_es):
    data = r.rpop(redis_task_es)
    if data:
        data = eval(data)
        bulk_action = data['bulk_action']
        index_name = data['index_name']
        doc_type = data['doc_type']

        try:
            print es.bulk(bulk_action,index=index_name,doc_type=doc_type,timeout=600)
        except Exception, e:  # on failure, retry in smaller batches; drop any pair that fails again
            print 'my_bulk_func Exception: ', str(e)
            for i in range(len(bulk_action)/2):
                temp_bulk_action = bulk_action[2*i : 2*i+2]
                try:
                    es.bulk(temp_bulk_action,index=index_name,doc_type=doc_type,timeout=600)
                except:
                    pass
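eval() on whatever comes out of redis will execute arbitrary code if anything else can push to the queue; a minimal sketch of the same consumer using JSON instead, assuming the producer is changed to push json.dumps(...) payloads:

import json

data = r.rpop(redis_task_es)
if data:
    task = json.loads(data)  # instead of eval(data); requires a JSON producer
    bulk_action = task['bulk_action']
    index_name = task['index_name']
    doc_type = task['doc_type']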

Example #6
def influence_cal_tw(current_time):
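	# Score every uid seen in the day's flow text: activeness, propagation,
	# coverage and trust are combined into one influence mark and bulk-indexed
	# into the per-day tw_bci index.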

	# if S_TYPE == 'test' :
	# 	current_time = datetime2ts(S_DATE_FB)

	current_date = ts2datetime(current_time)
	current_time = datetime2ts(current_date)
	
	flow_text_index_name = twitter_flow_text_index_name_pre + current_date
	count_index_name = twitter_count_index_name_pre + current_date
	tw_bci_index_name = tw_bci_index_name_pre + current_date

	tw_bci_mappings(tw_bci_index_name)

	uid_tid_dict = {}
	bulk_action = []

	# collect the uids to score from the day's flow-text (stream) index
	query_body_text = {
		'query':{
			'match_all':{}
		}
	}

	es_scan_result = scan(es,index=flow_text_index_name,doc_type=twitter_flow_text_index_type,\
							query=query_body_text,size=1000)
	#print 'es_scan_result...',es_scan_result
	while 1:
		try:
			scan_data = es_scan_result.next()
			item = scan_data['_source']
			uid = item['uid']
			tid = item['tid']

			try:
				uid_tid_dict[uid].append(tid)
			except KeyError:  # first tid seen for this uid
				uid_tid_dict[uid] = [tid]

		except StopIteration:
			break

	# # 
	# query_body_count = {
	# 	'query':{
	# 		'match_all':{}
	# 	}
	# }

	# es_scan_result_count = scan(es,index=count_index_name,doc_type=twitter_count_index_type,\
	# 						size=1000,query=query_body_count)

	# while 1:
	# 	try:
	# 		scan_data_count = es_scan_result_count.next()
	# 		item = scan_data_count['_source']
	# 		tid = item['tid']
	# 		uid = item['uid']
	# 		try:
	# 			if tid in uid_tid_dict[uid]:
	# 				continue
	# 			else:
	# 				uid_tid_dict[uid].append(tid)
	# 		except:
	# 			uid_tid_dict[uid] = [tid]

	# 	except StopIteration:
	# 		break

	count = 0

	
	for uid, tid_list in uid_tid_dict.iteritems():

		# activeness: the day's active-post count
		active_num = influence_active(uid,flow_text_index_name)
		#active_num = len(tid_list)

		# propagation: feedback received during the day
		propagate_num_sum = 1
		for tid in tid_list:
			propagate_num = influence_propagate(tid,count_index_name)
			propagate_num_sum += propagate_num

		# coverage: number of active followers
		cover_num = influence_cover(uid,flow_text_index_name)

		# trust: 1 if the category (organization) field is non-empty, else 0
		trust_num = influence_trust(uid)
		# print 'propagate_num_sum...',propagate_num_sum
		# print 'cover_num...',cover_num
		
		# max(cover_num, 1) guards math.log10 against a user with no active followers
		influence_mark = (active_num + math.log10(propagate_num_sum) + math.log10(max(cover_num, 1)) + trust_num) * 10

		action = {'index':{'_id':uid}}

		user_items = {}
		user_items['active'] = active_num
		user_items['propagate'] = propagate_num_sum
		user_items['cover'] = cover_num
		user_items['trust'] = trust_num
		user_items['influence'] = influence_mark
		user_items['uid'] = uid
		user_items['timestamp'] = current_time

		bulk_action.extend([action,user_items])

		count += 1

		if count % 1000 == 0:
			es.bulk(bulk_action,index=tw_bci_index_name,doc_type=tw_bci_index_type,timeout=400)
			bulk_action = []

	if bulk_action:
		es.bulk(bulk_action,index=tw_bci_index_name,doc_type=tw_bci_index_type,timeout=400)
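For a sense of the score's scale: the raw activity count dominates, while feedback and coverage are log-compressed. A quick worked example with made-up numbers:

import math

active_num = 5             # posts today
propagate_num_sum = 101    # 1 + feedback received
cover_num = 1000           # active followers
trust_num = 1              # organization category present

influence_mark = (active_num + math.log10(propagate_num_sum)
                  + math.log10(cover_num) + trust_num) * 10
print influence_mark       # (5 + 2.0043 + 3 + 1) * 10 = 110.04...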
Example #7
def scan_retweet(ft_type):
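    # Walk the current retweet redis db with SCAN, bulk-index each uid's
    # be_retweet hash into ES, then flush the redis db.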
    count = 0
    scan_cursor = 0
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    #get redis db number
    db_number = get_db_num(now_date_ts)

    if ft_type == 'fb':
        retweet_redis_dict = fb_retweet_dict
        index_name = 'fb_be_retweet_' +str(db_number)
    else:
        retweet_redis_dict = tw_retweet_dict
        index_name = 'tw_be_retweet_' +str(db_number)

    #get redis db
    retweet_redis = retweet_redis_dict[str(db_number)]
    # # 1. check whether the db about to be switched to already has data
    # while 1:
    #     redis_host_list.pop(db_number)
    #     other_db_number = retweet_redis_dict[redis_host_list[0]] # the corresponding redis
    #     current_dbsize = other_db_number.dbsize()
    #     if current_dbsize:
    #         break # writing has moved to the new db, so the previous day's data is complete
    #     else:
    #         time.sleep(60)

    # 2. delete the previous ES data (recreate the be_retweet mappings)
    be_retweet_es_mappings(str(db_number), ft_type)

    # 3. scan

    retweet_bulk_action = []
    be_retweet_bulk_action = []
    start_ts = time.time()
    #retweet count/be_retweet count
    #retweet_count = 0
    be_retweet_count = 0
    while True:
        re_scan = retweet_redis.scan(scan_cursor, count=100)
        scan_cursor = re_scan[0]  # advance the cursor; SCAN is done when it wraps back to 0
        '''
        if re_scan_cursor == 0:
            print 'scan finish'
            if retweet_bulk_action != []:
                es.bulk(retweet_bulk_action, index='retweet_'+str(db_number), doc_type='user')
            if be_retweet_bulk_action != []:
                es.bulk(be_retweet_bulk_action, index='be_retweet_'+str(db_number), doc_type='user')
            break
        '''
        for item in re_scan[1]:
            count += 1
            item_list = item.split('_')
            save_dict = {}
           
            if len(item_list)==3:
                be_retweet_count += 1
                uid = item_list[2]
                item_result = retweet_redis.hgetall(item)
                save_dict['uid'] = uid
                save_dict['uid_be_retweet'] = json.dumps(item_result)
                be_retweet_bulk_action.extend([{'index':{'_id':uid}}, save_dict])
        #print 'be_retweet_bulk_action...',be_retweet_bulk_action
        if be_retweet_bulk_action:
            es.bulk(be_retweet_bulk_action, index=index_name, doc_type='user')
        if scan_cursor == 0:  # SCAN has wrapped around: every key has been visited
            break
        retweet_bulk_action = []
        be_retweet_bulk_action = []
        end_ts = time.time()
        print '%s sec scan %s count user:' % (end_ts - start_ts, be_retweet_count), 'count:', count

    
    # flush the redis db now that everything has been written to ES

    retweet_redis.flushdb()
    
    print 'end'
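redis-py can also hide the cursor bookkeeping entirely via scan_iter; a minimal sketch of the same walk, using the same three-part 'xx_xx_<uid>' key convention as above:

for key in retweet_redis.scan_iter(count=100):
    parts = key.split('_')
    if len(parts) == 3:  # e.g. 'be_retweet_<uid>'
        uid = parts[2]
        save_dict = {'uid': uid,
                     'uid_be_retweet': json.dumps(retweet_redis.hgetall(key))}
        # ... extend be_retweet_bulk_action and es.bulk as above ...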