def update_day_sensitive(uid_list):
    results = {}
    for uid in uid_list:
        results[uid] = {"sensitive": 0, 'sensitive_string': "", 'sensitive_dict': json.dumps({})}
    now_ts = time.time()
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = datetime2ts('2013-09-02')
    sensitive_results = redis_cluster.hmget("sensitive_" + str(now_date_ts), uid_list)
    count = 0
    for item in sensitive_results:
        if not item:
            count += 1
            continue
        uid = uid_list[count]
        item = json.loads(item)
        sensitive_index = 0
        # use a distinct loop variable: the original shadowed `count`,
        # which is also the running index into uid_list
        for word, word_count in item.iteritems():
            tmp_stage = r_sensitive.hget("sensitive_words", word)
            if tmp_stage:
                tmp = json.loads(tmp_stage)
                sensitive_index += sensitive_score_dict[str(tmp[0])] * word_count
        sensitive_words_string = "&".join(item.keys())
        results[uid] = {'sensitive': sensitive_index,
                        "sensitive_words_string": sensitive_words_string,
                        "sensitive_words_dict": item}
        count += 1
    return results
def update_day_sensitive(uid_list):
    results = {}
    for uid in uid_list:
        results[uid] = {"sensitive": 0, "sensitive_string": "", "sensitive_dict": json.dumps({})}
    now_ts = time.time()
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = datetime2ts("2013-09-03")
    sensitive_results = redis_cluster.hmget("sensitive_" + str(now_date_ts), uid_list)
    # hmget returns a list of values aligned with uid_list (not dicts),
    # so iterate by index as in the variant above
    count = 0
    for item in sensitive_results:
        uid = uid_list[count]
        count += 1
        if not item:
            continue
        words_dict = json.loads(item)
        sensitive_index = 0
        sensitive_words_dict = json.dumps(words_dict)
        for word, word_count in words_dict.iteritems():
            tmp_stage = r_sensitive.hget("sensitive_words", word)
            if tmp_stage:
                tmp = json.loads(tmp_stage)
                sensitive_index += sensitive_score_dict[str(tmp[0])] * word_count
        sensitive_words_string = "&".join(words_dict.keys())
        results[uid] = {
            "sensitive": sensitive_index,
            "sensitive_words_string": sensitive_words_string,
            "sensitive_words_dict": sensitive_words_dict,
        }
    return results
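# For reference, a minimal sketch of the Redis layout the two readers above
# assume. Key names come from the code; the uid, word, and counts are made-up
# examples, and a plain StrictRedis connection stands in for the repo's
# redis_cluster / r_sensitive handles (datetime2ts is the repo's time utility).
import json
import redis

r_demo = redis.StrictRedis()
day_ts = datetime2ts('2013-09-02')
# one hash per day: field = uid, value = JSON dict of sensitive word -> count
r_demo.hset('sensitive_' + str(day_ts), '1234567890', json.dumps({'word_a': 2}))
# word metadata: field = word, value = JSON [level, category]; the score lookup
# above does sensitive_score_dict[str(level)] * count per matched word
r_demo.hset('sensitive_words', 'word_a', json.dumps([1, 'politics']))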
def createWordTree():
    wordTree = [None for x in range(256)]
    wordTree.append(0)
    nodeTree = [wordTree, 0]
    awords = []
    #for b in open('sensitive_words.txt', 'rb'):
    #    awords.append(b.strip())
    awords = r.hkeys('sensitive_words')
    print awords
    for word in awords:
        temp = wordTree
        for a in range(0, len(word)):
            index = ord(word[a])
            if a < (len(word) - 1):
                if temp[index] == None:
                    node = [[None for x in range(256)], 0]
                    temp[index] = node
                elif temp[index] == 1:
                    # a shorter word already ends here: replace the leaf
                    # marker with a node whose end-flag stays set
                    node = [[None for x in range(256)], 1]
                    temp[index] = node
                temp = temp[index][0]
            else:
                # last byte: mark a word end without clobbering an
                # existing subtree for a longer word with this prefix
                if temp[index] is None or temp[index] == 1:
                    temp[index] = 1
                else:
                    temp[index][1] = 1
    return nodeTree
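# A minimal sketch of the companion matcher, assuming searchWord(text, nodeTree)
# (used elsewhere in this repo but not defined in this section) walks the byte
# trie built by createWordTree. `text` is a UTF-8 byte string; the result maps
# matched word -> occurrence count. This sketch stops at the shortest match per
# position; the real implementation may differ.
def searchWord_sketch(text, nodeTree):
    result = {}
    i = 0
    while i < len(text):
        temp = nodeTree[0]  # children array of the root
        j = i
        matched = None
        while j < len(text):
            branch = temp[ord(text[j])]
            if branch is None:
                break
            # a word ends here: either a leaf marker or a node whose flag is set
            if branch == 1 or branch[1] == 1:
                matched = text[i:j + 1]
                break
            temp = branch[0]  # descend into the child's children array
            j += 1
        if matched:
            result[matched] = result.get(matched, 0) + 1
            i = j + 1  # resume scanning after the match
        else:
            i += 1
    return result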
def update_day_sensitive(uid_list):
    results = {}
    all_results = {}
    now_ts = time.time()
    #run_type
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = test_ts
    today_sensitive_dict = {}
    for i in range(WEEK, 0, -1):
        ts = now_date_ts - DAY * i
        count = 0
        sensitive_results = r_cluster_3.hmget('sensitive_' + str(ts), uid_list)
        for uid in uid_list:
            if uid not in results:
                results[uid] = {}
            sensitive_item = sensitive_results[count]
            if uid not in today_sensitive_dict:
                today_sensitive_dict[uid] = {}
            if sensitive_item:
                sensitive_dict = json.loads(sensitive_item)
            else:
                sensitive_dict = {}
            for sensitive in sensitive_dict:
                try:
                    results[uid][sensitive] += 1
                except:
                    results[uid][sensitive] = 1
                if ts == now_date_ts - DAY:
                    try:
                        today_sensitive_dict[uid][sensitive] += 1
                    except:
                        today_sensitive_dict[uid][sensitive] = 1
            count += 1
    for uid in uid_list:
        user_sensitive_dict = results[uid]
        sensitive_score = 0
        today_sensitive_dict_user = today_sensitive_dict[uid]
        for item in today_sensitive_dict_user:
            k = item
            v = today_sensitive_dict_user[k]
            tmp_stage = r_sensitive.hget('sensitive_words', k)
            if tmp_stage:
                sensitive_score += v * sensitive_score_dict[str(tmp_stage)]
        sensitive_string = '&'.join(user_sensitive_dict.keys())
        all_results[uid] = {'sensitive_string': sensitive_string,
                            'sensitive_dict': json.dumps(user_sensitive_dict),
                            'sensitive': sensitive_score}
    return all_results
def expand_index_action(item):
    index_body = {}
    index_body['uid'] = str(item['uid'])
    index_body['user_fansnum'] = int(item.get('user_fansnum', 0))
    index_body['text'] = item['text']
    index_body['mid'] = str(item['mid'])
    index_body['sentiment'] = str(item['sentiment'])
    index_body['timestamp'] = int(item['timestamp'])
    index_body['message_type'] = item['message_type']
    index_body['keywords_dict'] = item['keywords_dict']
    index_body['keywords_string'] = item['keywords_string']
    index_body['sensitive_words_string'] = item['sensitive_words_string']
    index_body['sensitive_words_dict'] = item['sensitive_words_dict']
    index_body['retweeted'] = 0
    index_body['comment'] = 0
    index_body['sensitive'] = 0
    sensitive_words_dict = json.loads(item['sensitive_words_dict'])
    if sensitive_words_dict:
        score = 0
        for k, v in sensitive_words_dict.iteritems():
            tmp_stage = r_sensitive.hget("sensitive_words", k)
            if tmp_stage:
                score += v * sensitive_score_dict[str(tmp_stage)]
        index_body['sensitive'] = score
    if item['message_type'] == 3:
        #for retweet message: get directed retweet uname and uid
        directed_uid, directed_uname = get_directed_retweet(item['text'], item['root_uid'])
        if directed_uid:
            index_body['directed_uid'] = int(directed_uid)
        else:
            index_body['directed_uid'] = 0
        index_body['directed_uname'] = directed_uname
        index_body['root_mid'] = str(item['root_mid'])
        index_body['root_uid'] = str(item['root_uid'])
    elif item['message_type'] == 2:
        #for comment message: get directed comment uname and uid
        directed_uid, directed_uname = get_directed_comment(item['text'], item['root_uid'])
        if directed_uid:
            index_body['directed_uid'] = int(directed_uid)
        else:
            index_body['directed_uid'] = 0
        index_body['directed_uname'] = directed_uname
        index_body['root_mid'] = str(item['root_mid'])
        index_body['root_uid'] = str(item['root_uid'])
    ip = item['send_ip']
    index_body['ip'] = ip
    index_body['geo'] = ip2city(ip)  #output: 中国&河北&石家庄
    action = {'index': {'_id': index_body['mid']}}
    xdata = index_body
    return action, xdata
# -*- coding:utf-8 -*-
from openpyxl import load_workbook
import redis
import random
import json
import sys
reload(sys)
sys.path.append('../')
from global_utils import R_ADMIN as r

data = load_workbook('sensitive_words.xlsx')
table = data.get_sheet_by_name('Sheet2')
category = ['政治', '军事', '法律', '意识形态', '民运']
#print table.cell('A1').value
for i in range(1, 549):
    word = table.cell(row=i, column=0).value
    level = table.cell(row=i, column=1).value
    index = random.randint(0, 4)
    r.hset('sensitive_words', word, json.dumps([level, category[index]]))

#r.hset('recommend_sensitive_words_20130901', '洪秀柱', json.dumps([['1093622153', '3270699555'], 3]))
#r.hset('recommend_sensitive_words_20130901', '港灿', json.dumps([['1686546714'], 1]))
#r.hset('recommend_sensitive_words_20130901', '钟屿晨', json.dumps([['1887344341', '3183040584'], 2]))
# -*- coding:utf-8 -*-
from openpyxl import load_workbook
import redis
import json
import sys
reload(sys)
sys.path.append('../../')
from global_utils import R_ADMIN as r
#r = redis.StrictRedis(host="10.128.55.69", port="6379", db=15)

data = load_workbook('sensitive_words.xlsx')
table = data.get_sheet_by_name('Sheet2')
for i in range(1, 549):
    word = table.cell(row=i, column=0).value
    level = table.cell(row=i, column=1).value
    r.hset('sensitive_words', word, level)

print r.hkeys('sensitive_words')
def test(ft_type):
    print ft_type
    if ft_type == 'facebook':
        index_name_pre = facebook_flow_text_index_name_pre
        index_type = facebook_flow_text_index_type
        user_index_name = facebook_user_index_name
        user_index_type = facebook_user_index_type
    else:
        index_name_pre = twitter_flow_text_index_name_pre
        index_type = twitter_flow_text_index_type
        user_index_name = twitter_user_index_name
        user_index_type = twitter_user_index_type

    # date_list = load_date_list(True)
    date_list = load_date_list()
    DFA = createWordTree()
    query_body = {
        'post_filter': {
            'missing': {
                'field': 'keywords_string'
            }
        },
        'query': {
            'filtered': {
                'filter': {
                    'bool': {
                        'must': [{
                            'range': {
                                'flag_ch': {
                                    'gte': -1
                                }
                            }
                        }]
                    }
                }
            }
        }
    }

    for date in date_list:
        count = 0
        bulk_action = []
        index_name = index_name_pre + date
        try:
            es_scan_results = scan(es, query=query_body, size=1000,
                                   index=index_name, doc_type=index_type)
            while True:
                try:
                    scan_data = es_scan_results.next()
                    item = scan_data['_source']
                    text = item['text_ch']
                    uid = item['uid']
                    if ft_type == 'facebook':
                        _id = item['fid']
                    else:
                        _id = item['tid']
                    ts = datetime2ts(date)

                    #add sentiment field to weibo
                    sentiment, keywords_list = triple_classifier(item)

                    #add key words to weibo
                    keywords_dict, keywords_string = get_weibo_keywords(keywords_list)

                    #sensitive_words_dict
                    sensitive_words_dict = searchWord(text.encode('utf-8', 'ignore'), DFA)
                    if sensitive_words_dict:
                        sensitive_words_string_data = "&".join(sensitive_words_dict.keys())
                        sensitive_words_dict_data = json.dumps(sensitive_words_dict)
                    else:
                        sensitive_words_string_data = ""
                        sensitive_words_dict_data = json.dumps({})

                    #redis
                    if sensitive_words_dict:
                        sensitive_count_string = r_cluster.hget('sensitive_' + str(ts), str(uid))
                        if sensitive_count_string:  # the hash may not have this uid yet
                            sensitive_count_dict = json.loads(sensitive_count_string)
                            for word in sensitive_words_dict.keys():
                                if sensitive_count_dict.has_key(word):
                                    sensitive_count_dict[word] += sensitive_words_dict[word]
                                else:
                                    sensitive_count_dict[word] = sensitive_words_dict[word]
                            r_cluster.hset('sensitive_' + str(ts), str(uid),
                                           json.dumps(sensitive_count_dict))
                        else:
                            r_cluster.hset('sensitive_' + str(ts), str(uid),
                                           json.dumps(sensitive_words_dict))

                    #sensitive
                    sensitive_score = 0
                    if sensitive_words_dict:
                        for k, v in sensitive_words_dict.iteritems():
                            tmp_stage = r_sensitive.hget("sensitive_words", k)
                            if tmp_stage:
                                sensitive_score += v * sensitive_score_dict[str(tmp_stage)]

                    #directed_uid
                    directed_uid_data = 0
                    directed_uid, directed_uname = get_root_retweet(text, uid, ft_type)
                    if directed_uid:
                        directed_uid_data = long(directed_uid)

                    # hashtag
                    hashtag = ''
                    RE = re.compile(u'#([0-9a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+)[ ," =.。: :、]')
                    hashtag_list = re.findall(RE, text)
                    if hashtag_list:
                        hashtag = '&'.join(hashtag_list)

                    #action
                    action = {'update': {'_id': _id}}

                    # action_data
                    action_data = {
                        'sentiment': str(sentiment),
                        'keywords_dict': json.dumps(keywords_dict),
                        'keywords_string': keywords_string,
                        'sensitive_words_string': sensitive_words_string_data,
                        'sensitive_words_dict': sensitive_words_dict_data,
                        'sensitive': sensitive_score,
                        'directed_uid': directed_uid_data,
                        'directed_uname': directed_uname,
                        'hashtag': hashtag,
                    }

                    bulk_action.extend([action, {'doc': action_data}])
                    count += 1
                    if count % 1000 == 0 and count != 0:
                        if bulk_action:
                            # use the per-platform doc type chosen above, not
                            # the facebook one for both platforms
                            es.bulk(bulk_action, index=index_name,
                                    doc_type=index_type, timeout=600)
                        bulk_action = []
                        count = 0
                except StopIteration:
                    break
            if bulk_action:
                es.bulk(bulk_action, index=index_name,
                        doc_type=index_type, timeout=600)
        except Exception, e:
            # ES document does not exist
            print e
def compute_topic_task():
    print time.time()
    while True:
        #print r.rpop(topic_queue_name)
        task = r.rpop(topic_queue_name)
        if not task:
            break
        else:
            task = json.loads(task)
            print task
            topic = task['name']
            en_name = task['en_name']
            start_ts = int(task['start_ts'])  #timestamp
            end_ts = int(task['end_ts'])  #timestamp
            submit_user = task['submit_user']
            comput_status = task['comput_status']
            task_id = str(start_ts) + '_' + str(end_ts) + '_' + en_name + '_' + submit_user
            exist_flag = exist(task_id)
            #get_topic_weibo(topic, en_name, start_ts, end_ts)
            if exist_flag:
                #start compute
                #try:
                weibo_es.update(index=topic_index_name, doc_type=topic_index_type,
                                id=task_id, body={'doc': {'comput_status': -1}})
                print 'finish change status'
                #geo
                repost_search(en_name, start_ts, end_ts)
                print 'finish geo_1 analyze'
                cityTopic(en_name, start_ts, end_ts)
                print 'finish geo analyze'
                #language
                count_fre(en_name, start_ts=start_ts, over_ts=end_ts,
                          news_limit=NEWS_LIMIT, weibo_limit=MAX_LANGUAGE_WEIBO)
                print 'finish language analyze'
                #time
                propagateCronTopic(en_name, start_ts, end_ts)
                print 'finish time analyze'
                #network
                compute_network(en_name, start_ts, end_ts)
                print 'finish network analyze'
                #sentiment
                sentimentTopic(en_name, start_ts=start_ts, over_ts=end_ts)
                print 'finish sentiment analyze'
                #finish compute
                print weibo_es.update(index=topic_index_name, doc_type=topic_index_type,
                                      id=task_id,
                                      body={'doc': {'comput_status': 1,
                                                    'finish_ts': int(time.time())}})
                print 'finish change status done'
                #except:
                #    raise
                #    break
            else:
                pass
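# A minimal producer sketch for the queue drained above. The field names are
# exactly those compute_topic_task reads; the values are made-up examples, and
# `r` / `topic_queue_name` are the module-level objects used above. The
# consumer pops with rpop, so lpush gives FIFO ordering.
import json
task = {
    'name': u'example topic',      # display name (may be non-ASCII)
    'en_name': 'example_topic',    # used to build the task_id
    'start_ts': 1462060800,        # timestamp
    'end_ts': 1462147200,          # timestamp
    'submit_user': 'admin',
    'comput_status': 0,            # 0 = queued; -1 while computing, 1 when done
}
r.lpush(topic_queue_name, json.dumps(task))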
def expand_index_action(item):
    index_body = {}
    index_body['uid'] = str(item['uid'])
    index_body['text'] = item['text']
    index_body['tid'] = str(item['tid'])
    index_body['sentiment'] = str(item['sentiment'])
    index_body['timestamp'] = int(item['timestamp'])
    #index_body['message_type'] = item['message_type']
    index_body['keywords_dict'] = item['keywords_dict']
    index_body['keywords_string'] = item['keywords_string']
    index_body['sensitive_words_string'] = item['sensitive_words_string']
    index_body['sensitive_words_dict'] = item['sensitive_words_dict']
    sensitive_words_dict = json.loads(item['sensitive_words_dict'])
    score = 0
    if sensitive_words_dict:
        for k, v in sensitive_words_dict.iteritems():
            tmp_stage = r_sensitive.hget("sensitive_words", k)
            if tmp_stage:
                score += v * sensitive_score_dict[str(tmp_stage)]
    index_body['sensitive'] = score
    #for retweet message: get directed retweet uname and uid
    # directed_uid, directed_uname = get_directed_retweet(item['text'], item['root_uid'])
    directed_uid, directed_uname = get_root_retweet(item['text'], item['uid'])
    if directed_uid:
        index_body['directed_uid'] = long(directed_uid)
    else:
        index_body['directed_uid'] = 0
    index_body['directed_uname'] = directed_uname
    # if item['message_type'] == 3:
    #     #for retweet message: get directed retweet uname and uid
    #     directed_uid, directed_uname = get_directed_retweet(item['text'], item['root_uid'])
    #     if directed_uid:
    #         index_body['directed_uid'] = int(directed_uid)
    #     else:
    #         index_body['directed_uid'] = 0
    #     index_body['directed_uname'] = directed_uname
    #     index_body['root_tid'] = str(item['root_tid'])
    #     index_body['root_uid'] = str(item['root_uid'])
    # elif item['message_type'] == 2:
    #     #for comment message: get directed comment uname and uid
    #     directed_uid, directed_uname = get_directed_comment(item['text'], item['root_uid'])
    #     if directed_uid:
    #         index_body['directed_uid'] = int(directed_uid)
    #     else:
    #         index_body['directed_uid'] = 0
    #     index_body['directed_uname'] = directed_uname
    #     index_body['root_tid'] = str(item['root_tid'])
    #     index_body['root_uid'] = str(item['root_uid'])
    # ip = item['send_ip']
    # index_body['ip'] = ip
    # index_body['geo'] = ip2city(ip)  #output: 中国&河北&石家庄
    action = {'index': {'_id': index_body['tid']}}
    xdata = index_body
    return action, xdata
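# A minimal usage sketch: expand_index_action returns an (action, xdata) pair
# per tweet; pairs are flattened into one list and sent in a single bulk call,
# mirroring the es.bulk(...) pattern used elsewhere in this section. `items`,
# `index_name`, and `index_type` are assumed to be supplied by the caller.
bulk_action = []
for item in items:
    action, xdata = expand_index_action(item)
    bulk_action.extend([action, xdata])
if bulk_action:
    es.bulk(bulk_action, index=index_name, doc_type=index_type)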
def get_flow_information(uid_list):
    results = {}
    #results = {uid:{'hashtag_dict':{},'hashtag':'', 'keywords_dict':{}, 'keywords_string':'', 'activity_geo':'', 'activity_geo_dict':dict}}
    iter_results = {}  # iter_results = {uid:{'hashtag': hashtag_dict, 'geo':geo_dict, 'keywords':keywords_dict}}
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    #test
    now_date_ts = test_ts
    for i in range(7, 0, -1):
        ts = now_date_ts - DAY * i
        iter_date = ts2datetime(ts)
        flow_text_index_name = flow_text_index_name_pre + iter_date
        uid_day_geo = {}
        #compute hashtag and geo
        hashtag_results = r_cluster.hmget('hashtag_' + str(ts), uid_list)
        ip_results = r_cluster.hmget('new_ip_' + str(ts), uid_list)
        #compute sensitive_words
        sensitive_results = r_cluster.hmget('sensitive_' + str(ts), uid_list)
        count = 0
        for uid in uid_list:
            #init iter_results[uid] (including 'filter_keywords', which the
            #accumulation below requires)
            if uid not in iter_results:
                iter_results[uid] = {'hashtag': {}, 'geo': {}, 'geo_track': [],
                                     'keywords': {}, 'filter_keywords': {}, 'sensitive': {}}
            #compute hashtag
            hashtag_item = hashtag_results[count]
            if hashtag_item:
                uid_hashtag_dict = json.loads(hashtag_item)
            else:
                uid_hashtag_dict = {}
            for hashtag in uid_hashtag_dict:
                try:
                    iter_results[uid]['hashtag'][hashtag] += uid_hashtag_dict[hashtag]
                except:
                    iter_results[uid]['hashtag'][hashtag] = uid_hashtag_dict[hashtag]
            #compute sensitive
            sensitive_item = sensitive_results[count]
            if sensitive_item:
                uid_sensitive_dict = json.loads(sensitive_item)
            else:
                uid_sensitive_dict = {}
            for sensitive_word in uid_sensitive_dict:
                try:
                    iter_results[uid]['sensitive'][sensitive_word] += uid_sensitive_dict[sensitive_word]
                except:
                    iter_results[uid]['sensitive'][sensitive_word] = uid_sensitive_dict[sensitive_word]
            #compute geo
            uid_day_geo[uid] = {}
            ip_item = ip_results[count]
            if ip_item:
                uid_ip_dict = json.loads(ip_item)
            else:
                uid_ip_dict = {}
            for ip in uid_ip_dict:
                ip_count = len(uid_ip_dict[ip].split('&'))
                geo = ip2city(ip)
                if geo:
                    try:
                        iter_results[uid]['geo'][geo] += ip_count
                    except:
                        iter_results[uid]['geo'][geo] = ip_count
                    try:
                        uid_day_geo[uid][geo] += ip_count
                    except:
                        uid_day_geo[uid][geo] = ip_count
            iter_results[uid]['geo_track'].append(uid_day_geo[uid])
            count += 1
        #compute keywords:
        try:
            text_results = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,
                                               body={'query': {'filtered': {'filter': {'terms': {'uid': uid_list}}}}, 'size': MAX_VALUE},
                                               _source=True, fields=['uid', 'keywords_dict', 'text'])['hits']['hits']
        except:
            text_results = {}
        for item in text_results:
            uid = item['fields']['uid'][0]
            uid_keywords_dict = json.loads(item['fields']['keywords_dict'][0])
            for keywords in uid_keywords_dict:
                try:
                    iter_results[uid]['keywords'][keywords] += uid_keywords_dict[keywords]
                except:
                    iter_results[uid]['keywords'][keywords] = uid_keywords_dict[keywords]
            #jln filter keyword 2016/11/08
            weibo_text = json.loads(item['fields']['text'][0])
            filter_keywords_dict = get_weibo_single(weibo_text)
            for keywords in filter_keywords_dict:
                try:
                    iter_results[uid]['filter_keywords'][keywords] += filter_keywords_dict[keywords]
                except:
                    iter_results[uid]['filter_keywords'][keywords] = filter_keywords_dict[keywords]
    #get keywords top
    for uid in uid_list:
        results[uid] = {}
        hashtag_dict = iter_results[uid]['hashtag']
        results[uid]['hashtag_dict'] = json.dumps(hashtag_dict)
        results[uid]['hashtag'] = '&'.join(hashtag_dict.keys())
        #sensitive words
        sensitive_word_dict = iter_results[uid]['sensitive']
        results[uid]['sensitive_dict'] = json.dumps(sensitive_word_dict)
        results[uid]['sensitive_string'] = '&'.join(sensitive_word_dict.keys())
        sensitive_score = 0
        for item in sensitive_word_dict:
            k = item
            v = sensitive_word_dict[k]
            tmp_stage = r_sensitive.hget('sensitive_words', k)
            if tmp_stage:
                sensitive_score += v * sensitive_score_dict[str(tmp_stage)]
        results[uid]['sensitive'] = sensitive_score
        #geo
        geo_dict = iter_results[uid]['geo']
        geo_track_list = iter_results[uid]['geo_track']
        results[uid]['activity_geo_dict'] = json.dumps(geo_track_list)
        geo_dict_keys = geo_dict.keys()
        results[uid]['activity_geo'] = '&'.join(['&'.join(item.split('\t')) for item in geo_dict_keys])
        #keywords
        keywords_dict = iter_results[uid]['keywords']
        keywords_top50 = sorted(keywords_dict.items(), key=lambda x: x[1], reverse=True)[:50]
        keywords_top50_string = '&'.join([keyword_item[0] for keyword_item in keywords_top50])
        #filter keywords
        filter_keywords_dict = iter_results[uid]['filter_keywords']
        f_keywords_top50 = sorted(filter_keywords_dict.items(), key=lambda x: x[1], reverse=True)[:50]
        f_keywords_top50_string = '&'.join([keyword_item[0] for keyword_item in f_keywords_top50])
        results[uid]['keywords'] = json.dumps(keywords_top50)
        results[uid]['keywords_string'] = keywords_top50_string
        results[uid]['filter_keywords'] = json.dumps(f_keywords_top50)
        results[uid]['filter_keywords_string'] = f_keywords_top50_string
    return results
def main():
    if RUN_TYPE:
        now_ts = time.time() - DAY  # the previous day
        #now_ts = datetime2ts('2016-03-24')  # debug override
        ts = str(datetime2ts(ts2datetime(now_ts)))
    else:
        ts = str(datetime2ts('2016-03-16'))
    now_ts = int(ts)
    print now_ts
    sensitive_string = "sensitive_" + ts
    date_string = ts
    update_sensitive_key = "sensitive_score_" + ts  # key to update
    sensitive_dict_key = "sensitive_dict_" + ts
    sensitive_string_key = "sensitive_string_" + ts
    sensitive_day_change_key = "sensitive_" + ts + "_day_change"
    del_month = datetime2ts(ts2datetime(now_ts - MONTH))
    del_sensitive_key = "sensitive_score_" + str(del_month)  # key to delete
    former_ts = int(ts) - DAY
    former_date = str(datetime2ts(ts2datetime(former_ts)))
    former_sensitive_key = "sensitive_score_" + former_date
    iter_count = 0
    bulk_action = []
    mappings(ES_SENSITIVE_INDEX)
    total_number = r.hlen(sensitive_string)
    scan_cursor = 0
    print total_number
    while 1:
        re_scan = r.hscan(sensitive_string, scan_cursor, count=1000)
        scan_cursor = re_scan[0]
        if len(re_scan[1]) != 0:
            sensitive_info = re_scan[1]  # dict: uid -> sensitive_words_dict
            uid_list = sensitive_info.keys()
            sensitive_results = es.mget(index=ES_SENSITIVE_INDEX,
                                        doc_type=DOCTYPE_SENSITIVE_INDEX,
                                        body={"ids": uid_list})['docs']
            if sensitive_results:
                for item in sensitive_results:
                    uid = item['_id']
                    sensitive_words_dict = json.loads(sensitive_info[uid])
                    current_sensitive_score = 0
                    for k, v in sensitive_words_dict.iteritems():
                        tmp_stage = r_sensitive.hget("sensitive_words", k)
                        if tmp_stage:
                            current_sensitive_score += v * sensitive_score_dict[str(tmp_stage)]
                    if item['found']:  # the user already has a record
                        revise_item = item["_source"]
                        if del_sensitive_key in revise_item:
                            revise_item.pop(del_sensitive_key)
                        revise_item['uid'] = uid
                        # newly updated sensitivity score
                        revise_item[update_sensitive_key] = current_sensitive_score
                        # newly updated sensitive words
                        revise_item[sensitive_dict_key] = sensitive_info[uid]
                        # newly updated sensitive-word string
                        revise_item[sensitive_string_key] = "&".join(sensitive_words_dict.keys())
                        # difference from the previous day and the week/month averages
                        revise_item['sensitive_day_change'] = current_sensitive_score - revise_item.get(former_sensitive_key, 0)
                        revise_item['sensitive_week_change'] = current_sensitive_score - revise_item.get('sensitive_week_ave', 0)
                        revise_item['sensitive_month_change'] = current_sensitive_score - revise_item.get('sensitive_month_ave', 0)
                        # updated weekly/monthly mean and variance
                        revise_item['sensitive_week_ave'], revise_item['sensitive_week_var'], revise_item['sensitive_week_sum'] = compute_week(revise_item, now_ts)
                        revise_item['sensitive_month_ave'], revise_item['sensitive_month_var'], revise_item['sensitive_month_sum'] = compute_month(revise_item, now_ts)
                    else:
                        revise_item = dict()
                        revise_item['uid'] = uid
                        revise_item[update_sensitive_key] = current_sensitive_score
                        revise_item[sensitive_dict_key] = sensitive_info[uid]
                        revise_item[sensitive_string_key] = "&".join(sensitive_words_dict.keys())
                        revise_item['sensitive_day_change'] = current_sensitive_score
                        revise_item['sensitive_week_change'] = current_sensitive_score
                        revise_item['sensitive_month_change'] = current_sensitive_score
                        revise_item['sensitive_week_ave'], revise_item['sensitive_week_var'], revise_item['sensitive_week_sum'] = compute_week(revise_item, now_ts)
                        revise_item['sensitive_month_ave'], revise_item['sensitive_month_var'], revise_item['sensitive_month_sum'] = compute_month(revise_item, now_ts)
                    action = {'index': {'_id': uid}}
                    bulk_action.extend([action, revise_item])
                    iter_count += 1
                    if iter_count % 1000 == 0:
                        es.bulk(bulk_action, index=ES_SENSITIVE_INDEX,
                                doc_type=DOCTYPE_SENSITIVE_INDEX)
                        bulk_action = []
                        print iter_count
        if int(scan_cursor) == 0:
            break
    if bulk_action:
        es.bulk(bulk_action, index=ES_SENSITIVE_INDEX,
                doc_type=DOCTYPE_SENSITIVE_INDEX)
    print iter_count
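# A hedged sketch of what compute_week (not shown in this section) plausibly
# returns: the mean, variance, and sum of the per-day "sensitive_score_<day_ts>"
# fields that main() writes into the same ES document; compute_month would be
# analogous over ~30 days. datetime2ts / ts2datetime / DAY come from the repo's
# time utilities, and the *_sketch name marks this as a guess, not repo code.
def compute_week_sketch(revise_item, now_ts):
    scores = []
    for i in range(7):
        day_ts = datetime2ts(ts2datetime(now_ts - i * DAY))
        scores.append(revise_item.get('sensitive_score_' + str(day_ts), 0))
    total = sum(scores)
    ave = total / 7.0
    var = sum((s - ave) ** 2 for s in scores) / 7.0
    return ave, var, total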
def main():
    if RUN_TYPE:
        now_ts = time.time() - DAY  # the previous day
        ts = str(datetime2ts(ts2datetime(now_ts)))
    else:
        ts = str(datetime2ts('2013-09-02'))
    now_ts = int(ts)
    print now_ts
    sensitive_string = "sensitive_" + ts
    date_string = ts
    update_sensitive_key = "sensitive_score_" + ts  # key to update
    sensitive_dict_key = "sensitive_dict_" + ts
    sensitive_string_key = "sensitive_string_" + ts
    sensitive_day_change_key = "sensitive_" + ts + "_day_change"
    del_month = datetime2ts(ts2datetime(now_ts - MONTH))
    del_sensitive_key = "sensitive_score_" + str(del_month)  # key to delete
    former_ts = int(ts) - DAY
    former_date = str(datetime2ts(ts2datetime(former_ts)))
    former_sensitive_key = "sensitive_score_" + former_date
    iter_count = 0
    bulk_action = []
    mappings(ES_SENSITIVE_INDEX)
    total_number = r_cluster.hlen(sensitive_string)
    scan_cursor = 0
    print total_number
    while 1:
        re_scan = r_cluster.hscan(sensitive_string, scan_cursor, count=1000)
        scan_cursor = re_scan[0]
        if len(re_scan[1]) != 0:
            sensitive_info = re_scan[1]  # dict: uid -> sensitive_words_dict
            uid_list = sensitive_info.keys()
            sensitive_results = es.mget(index=ES_SENSITIVE_INDEX,
                                        doc_type=DOCTYPE_SENSITIVE_INDEX,
                                        body={"ids": uid_list})['docs']
            if sensitive_results:
                for item in sensitive_results:
                    uid = item['_id']
                    sensitive_words_dict = json.loads(sensitive_info[uid])
                    current_sensitive_score = 0
                    for k, v in sensitive_words_dict.iteritems():
                        tmp_stage = r_sensitive.hget("sensitive_words", k)
                        if tmp_stage:
                            tmp_stage = json.loads(tmp_stage)
                            current_sensitive_score += v * sensitive_score_dict[str(tmp_stage[0])]
                    if item['found']:  # the user already has a record
                        revise_item = item["_source"]
                        if del_sensitive_key in revise_item:
                            revise_item.pop(del_sensitive_key)
                        revise_item['uid'] = uid
                        # newly updated sensitivity score
                        revise_item[update_sensitive_key] = current_sensitive_score
                        revise_item['last_value'] = current_sensitive_score
                        # newly updated sensitive words
                        revise_item[sensitive_dict_key] = sensitive_info[uid]
                        # newly updated sensitive-word string
                        revise_item[sensitive_string_key] = "&".join(sensitive_words_dict.keys())
                        # difference from the previous day and the week/month averages
                        revise_item['sensitive_day_change'] = current_sensitive_score - revise_item.get(former_sensitive_key, 0)
                        revise_item['sensitive_week_change'] = current_sensitive_score - revise_item.get('sensitive_week_ave', 0)
                        revise_item['sensitive_month_change'] = current_sensitive_score - revise_item.get('sensitive_month_ave', 0)
                        # updated weekly/monthly mean and variance
                        revise_item['sensitive_week_ave'], revise_item['sensitive_week_var'], revise_item['sensitive_week_sum'] = compute_week(revise_item, now_ts)
                        revise_item['sensitive_month_ave'], revise_item['sensitive_month_var'], revise_item['sensitive_month_sum'] = compute_month(revise_item, now_ts)
                    else:
                        revise_item = dict()
                        revise_item['uid'] = uid
                        revise_item[update_sensitive_key] = current_sensitive_score
                        revise_item['last_value'] = current_sensitive_score
                        revise_item[sensitive_dict_key] = sensitive_info[uid]
                        revise_item[sensitive_string_key] = "&".join(sensitive_words_dict.keys())
                        revise_item['sensitive_day_change'] = current_sensitive_score
                        revise_item['sensitive_week_change'] = current_sensitive_score
                        revise_item['sensitive_month_change'] = current_sensitive_score
                        revise_item['sensitive_week_ave'], revise_item['sensitive_week_var'], revise_item['sensitive_week_sum'] = compute_week(revise_item, now_ts)
                        revise_item['sensitive_month_ave'], revise_item['sensitive_month_var'], revise_item['sensitive_month_sum'] = compute_month(revise_item, now_ts)
                    action = {'index': {'_id': uid}}
                    bulk_action.extend([action, revise_item])
                    iter_count += 1
                    if iter_count % 1000 == 0:
                        es.bulk(bulk_action, index=ES_SENSITIVE_INDEX,
                                doc_type=DOCTYPE_SENSITIVE_INDEX)
                        bulk_action = []
                        print iter_count
        if int(scan_cursor) == 0:
            break
    if bulk_action:
        es.bulk(bulk_action, index=ES_SENSITIVE_INDEX,
                doc_type=DOCTYPE_SENSITIVE_INDEX)
    print iter_count

    ####### update users whose documents were not touched above
    update_scan = scan(es,
                       query={"query": {"filtered": {"filter": {"missing": {"field": update_sensitive_key}}}}},
                       index=ES_SENSITIVE_INDEX,
                       doc_type=DOCTYPE_SENSITIVE_INDEX)
    iter_count = 0
    bulk_action = []
    while 1:
        try:
            tmp = update_scan.next()
            revise_item = tmp['_source']
            if del_sensitive_key in revise_item:
                revise_item.pop(del_sensitive_key)
            uid = tmp['_id']
            # newly updated sensitivity score
            revise_item[update_sensitive_key] = 0
            revise_item['last_value'] = 0
            # newly updated sensitive words
            revise_item[sensitive_dict_key] = json.dumps({})
            # newly updated sensitive-word string
            revise_item[sensitive_string_key] = ""
            # difference from the previous day and the week/month averages
            revise_item['sensitive_day_change'] = 0 - revise_item.get(former_sensitive_key, 0)
            revise_item['sensitive_week_change'] = 0 - revise_item.get('sensitive_week_ave', 0)
            revise_item['sensitive_month_change'] = 0 - revise_item.get('sensitive_month_ave', 0)
            # updated weekly/monthly mean and variance
            revise_item['sensitive_week_ave'], revise_item['sensitive_week_var'], revise_item['sensitive_week_sum'] = compute_week(revise_item, now_ts)
            revise_item['sensitive_month_ave'], revise_item['sensitive_month_var'], revise_item['sensitive_month_sum'] = compute_month(revise_item, now_ts)
            action = {'index': {'_id': uid}}
            bulk_action.extend([action, revise_item])
            iter_count += 1
            if iter_count % 1000 == 0:
                es.bulk(bulk_action, index=ES_SENSITIVE_INDEX,
                        doc_type=DOCTYPE_SENSITIVE_INDEX)
                bulk_action = []
        except StopIteration:
            print "all done"
            if bulk_action:
                es.bulk(bulk_action, index=ES_SENSITIVE_INDEX,
                        doc_type=DOCTYPE_SENSITIVE_INDEX)
            break
        except Exception, r:
            print Exception, r
def get_flow_information(uid_list):
    results = {}
    #results = {uid:{'hashtag_dict':{},'hashtag':'', 'keywords_dict':{}, 'keywords_string':'', 'activity_geo':'', 'activity_geo_dict':dict}}
    iter_results = {}  # iter_results = {uid:{'hashtag': hashtag_dict, 'geo':geo_dict, 'keywords':keywords_dict}}
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    #test
    now_date_ts = test_ts
    for i in range(7, 0, -1):
        ts = now_date_ts - DAY * i
        iter_date = ts2datetime(ts)
        flow_text_index_name = flow_text_index_name_pre + iter_date
        uid_day_geo = {}
        #compute hashtag and geo
        hashtag_results = r_cluster_3.hmget('hashtag_' + str(ts), uid_list)
        ip_results = r_cluster.hmget('new_ip_' + str(ts), uid_list)
        #compute sensitive_words
        sensitive_results = r_cluster_3.hmget('sensitive_' + str(ts), uid_list)
        count = 0
        for uid in uid_list:
            #init iter_results[uid]
            if uid not in iter_results:
                iter_results[uid] = {'hashtag': {}, 'geo': {}, 'geo_track': [],
                                     'keywords': {}, 'sensitive': {}}
            #compute hashtag
            hashtag_item = hashtag_results[count]
            if hashtag_item:
                uid_hashtag_dict = json.loads(hashtag_item)
            else:
                uid_hashtag_dict = {}
            for hashtag in uid_hashtag_dict:
                try:
                    iter_results[uid]['hashtag'][hashtag] += uid_hashtag_dict[hashtag]
                except:
                    iter_results[uid]['hashtag'][hashtag] = uid_hashtag_dict[hashtag]
            #compute sensitive
            sensitive_item = sensitive_results[count]
            if sensitive_item:
                uid_sensitive_dict = json.loads(sensitive_item)
            else:
                uid_sensitive_dict = {}
            for sensitive_word in uid_sensitive_dict:
                try:
                    iter_results[uid]['sensitive'][sensitive_word] += uid_sensitive_dict[sensitive_word]
                except:
                    iter_results[uid]['sensitive'][sensitive_word] = uid_sensitive_dict[sensitive_word]
            #compute geo
            uid_day_geo[uid] = {}
            ip_item = ip_results[count]
            if ip_item:
                uid_ip_dict = json.loads(ip_item)
            else:
                uid_ip_dict = {}
            for ip in uid_ip_dict:
                ip_count = len(uid_ip_dict[ip].split('&'))
                geo = ip2city(ip)
                if geo:
                    try:
                        iter_results[uid]['geo'][geo] += ip_count
                    except:
                        iter_results[uid]['geo'][geo] = ip_count
                    try:
                        uid_day_geo[uid][geo] += ip_count
                    except:
                        uid_day_geo[uid][geo] = ip_count
            iter_results[uid]['geo_track'].append(uid_day_geo[uid])
            count += 1
        #compute keywords:
        try:
            text_results = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,
                                               body={'query': {'filtered': {'filter': {'terms': {'uid': uid_list}}}}, 'size': MAX_VALUE},
                                               _source=True, fields=['uid', 'keywords_dict'])['hits']['hits']
        except:
            text_results = {}
        for item in text_results:
            uid = item['fields']['uid'][0]
            uid_keywords_dict = json.loads(item['fields']['keywords_dict'][0])
            for keywords in uid_keywords_dict:
                try:
                    iter_results[uid]['keywords'][keywords] += uid_keywords_dict[keywords]
                except:
                    iter_results[uid]['keywords'][keywords] = uid_keywords_dict[keywords]
    #get keywords top
    for uid in uid_list:
        results[uid] = {}
        hashtag_dict = iter_results[uid]['hashtag']
        results[uid]['hashtag_dict'] = json.dumps(hashtag_dict)
        results[uid]['hashtag'] = '&'.join(hashtag_dict.keys())
        #sensitive words
        sensitive_word_dict = iter_results[uid]['sensitive']
        results[uid]['sensitive_dict'] = json.dumps(sensitive_word_dict)
        results[uid]['sensitive_string'] = '&'.join(sensitive_word_dict.keys())
        sensitive_score = 0
        for item in sensitive_word_dict:
            k = item
            v = sensitive_word_dict[k]
            tmp_stage = r_sensitive.hget('sensitive_words', k)
            if tmp_stage:
                sensitive_score += v * sensitive_score_dict[str(tmp_stage)]
        results[uid]['sensitive'] = sensitive_score
        #geo
        geo_dict = iter_results[uid]['geo']
        geo_track_list = iter_results[uid]['geo_track']
        results[uid]['activity_geo_dict'] = json.dumps(geo_track_list)
        geo_dict_keys = geo_dict.keys()
        results[uid]['activity_geo'] = '&'.join(['&'.join(item.split('\t')) for item in geo_dict_keys])
        #keywords
        keywords_dict = iter_results[uid]['keywords']
        keywords_top50 = sorted(keywords_dict.items(), key=lambda x: x[1], reverse=True)[:50]
        keywords_top50_string = '&'.join([keyword_item[0] for keyword_item in keywords_top50])
        results[uid]['keywords'] = json.dumps(keywords_top50)
        results[uid]['keywords_string'] = keywords_top50_string
    return results
# -*- coding:utf-8 -*-
from openpyxl import load_workbook
import redis
import json
import sys
reload(sys)
sys.path.append('../../')
from global_utils import R_ADMIN as r
#r = redis.StrictRedis(host="10.128.55.69", port="6379", db=15)

"""
data = load_workbook('sensitive_words.xlsx')
table = data.get_sheet_by_name('Sheet2')
for i in range(1,549):
    word = table.cell(row=i, column=0).value
    level = table.cell(row=i, column=1).value
    r.hset('sensitive_words', word, level)
print r.hkeys('sensitive_words')
"""

# the .decode('utf-8').encode('utf-8') round trip in the original is a no-op
# for a utf-8 source file, so the literal can be stored directly
r.hset("sensitive_words", "疫苗之殇", 1)
def get_flow_information_v2(uid_list, all_user_keywords_dict):
    results = {}
    #results = {uid:{'hashtag_dict':{},'hashtag':'', 'keywords_dict':{}, 'keywords_string':'', 'activity_geo':'', 'activity_geo_dict':dict, 'activity_geo_aggs':''}}
    iter_results = {}  # iter_results = {uid:{'hashtag': hashtag_dict, 'geo':geo_dict, 'keywords':keywords_dict}}
    now_ts = time.time()
    #run_type
    today_sensitive_results = {}
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = test_ts
    for i in range(WEEK, 0, -1):
        ts = now_date_ts - DAY * i
        uid_day_geo = {}
        #compute hashtag and geo
        hashtag_results = r_cluster_3.hmget('hashtag_' + str(ts), uid_list)
        ip_results = r_cluster.hmget('new_ip_' + str(ts), uid_list)
        #compute sensitive_words
        sensitive_results = r_cluster_3.hmget('sensitive_' + str(ts), uid_list)
        count = 0
        for uid in uid_list:
            #init iter_results[uid]
            if uid not in iter_results:
                iter_results[uid] = {'hashtag': {}, 'geo': {}, 'geo_track': [],
                                     'keywords': {}, 'sensitive': {}, 'school': {}}
            if uid not in today_sensitive_results:
                today_sensitive_results[uid] = {}
            #compute hashtag
            hashtag_item = hashtag_results[count]
            if hashtag_item:
                uid_hashtag_dict = json.loads(hashtag_item)
            else:
                uid_hashtag_dict = {}
            for hashtag in uid_hashtag_dict:
                try:
                    iter_results[uid]['hashtag'][hashtag] += uid_hashtag_dict[hashtag]
                except:
                    iter_results[uid]['hashtag'][hashtag] = uid_hashtag_dict[hashtag]
            #compute sensitive
            sensitive_item = sensitive_results[count]
            if sensitive_item:
                uid_sensitive_dict = json.loads(sensitive_item)
            else:
                uid_sensitive_dict = {}
            for sensitive_word in uid_sensitive_dict:
                try:
                    iter_results[uid]['sensitive'][sensitive_word] += uid_sensitive_dict[sensitive_word]
                except:
                    iter_results[uid]['sensitive'][sensitive_word] = uid_sensitive_dict[sensitive_word]
                if ts == now_date_ts - DAY:
                    try:
                        today_sensitive_results[uid][sensitive_word] += uid_sensitive_dict[sensitive_word]
                    except:
                        today_sensitive_results[uid][sensitive_word] = uid_sensitive_dict[sensitive_word]
            #compute geo
            uid_day_geo[uid] = {}
            ip_item = ip_results[count]
            if ip_item:
                uid_ip_dict = json.loads(ip_item)
            else:
                uid_ip_dict = {}
            for ip in uid_ip_dict:
                ip_count = len(uid_ip_dict[ip].split('&'))
                geo, school = ip2city(ip)
                if geo:
                    try:
                        iter_results[uid]['geo'][geo] += ip_count
                    except:
                        iter_results[uid]['geo'][geo] = ip_count
                    try:
                        uid_day_geo[uid][geo] += ip_count
                    except:
                        uid_day_geo[uid][geo] = ip_count
                if school:
                    try:
                        iter_results[uid]['school'][school] += ip_count
                    except:
                        iter_results[uid]['school'][school] = ip_count
            iter_results[uid]['geo_track'].append(uid_day_geo[uid])
            count += 1
    #get keywords top
    for uid in uid_list:
        results[uid] = {}
        #hashtag
        hashtag_dict = iter_results[uid]['hashtag']
        results[uid]['hashtag_dict'] = json.dumps(hashtag_dict)
        results[uid]['hashtag'] = '&'.join(hashtag_dict.keys())
        #sensitive words
        sensitive_word_dict = iter_results[uid]['sensitive']
        results[uid]['sensitive_dict'] = json.dumps(sensitive_word_dict)
        results[uid]['sensitive_string'] = '&'.join(sensitive_word_dict.keys())
        sensitive_score = 0
        today_sensitive_results_user = today_sensitive_results[uid]
        for sensitive_item in today_sensitive_results_user:
            k = sensitive_item
            v = today_sensitive_results_user[sensitive_item]
            tmp_stage = r_sensitive.hget('sensitive_words', k)
            if tmp_stage:
                sensitive_score += v * sensitive_score_dict[str(tmp_stage)]
        results[uid]['sensitive'] = sensitive_score
        #geo
        geo_dict = iter_results[uid]['geo']
        geo_track_list = iter_results[uid]['geo_track']
        results[uid]['activity_geo_dict'] = json.dumps(geo_track_list)
        geo_dict_keys = geo_dict.keys()
        results[uid]['activity_geo'] = '&'.join(['&'.join(item.split('\t')) for item in geo_dict_keys])
        try:
            results[uid]['activity_geo_aggs'] = '&'.join([item.split('\t')[-1] for item in geo_dict_keys])
        except:
            results[uid]['activity_geo_aggs'] = ''
        #keywords
        keywords_dict = all_user_keywords_dict[uid]
        keywords_top50 = sorted(keywords_dict.items(), key=lambda x: x[1], reverse=True)[:50]
        keywords_top50_string = '&'.join([keyword_item[0] for keyword_item in keywords_top50])
        results[uid]['keywords'] = json.dumps(keywords_top50)
        results[uid]['keywords_string'] = keywords_top50_string
        #school dict
        school_dict = iter_results[uid]['school']
        school_string = '&'.join(school_dict.keys())
        if school_dict != {}:
            is_school = '1'
        else:
            is_school = '0'
        results[uid]['is_school'] = is_school
        results[uid]['school_string'] = school_string
        results[uid]['school_dict'] = json.dumps(school_dict)
    return results
def get_flow_information_v2(uid_list, all_user_keywords_dict):
    results = {}
    #results = {uid:{'hashtag_dict':{},'hashtag':'', 'keywords_dict':{}, 'keywords_string':'', 'activity_geo':'', 'activity_geo_dict':dict, 'activity_geo_aggs':''}}
    iter_results = {}  # iter_results = {uid:{'hashtag': hashtag_dict, 'geo':geo_dict, 'keywords':keywords_dict}}
    now_ts = time.time()
    #run_type
    today_sensitive_results = {}
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = test_ts
    print 'run_type:', RUN_TYPE
    for i in range(WEEK, 0, -1):
        ts = now_date_ts - DAY * i
        uid_day_geo = {}
        #compute hashtag and geo
        hashtag_results = r_cluster_3.hmget('hashtag_' + str(ts), uid_list)
        ip_results = r_cluster.hmget('new_ip_' + str(ts), uid_list)
        #compute sensitive_words
        sensitive_results = r_cluster_3.hmget('sensitive_' + str(ts), uid_list)
        count = 0
        for uid in uid_list:
            #init iter_results[uid]
            if uid not in iter_results:
                iter_results[uid] = {
                    'hashtag': {},
                    'geo': {},
                    'geo_track': [],
                    'keywords': {},
                    'sensitive': {},
                    'school': {},
                    'week_ip': {0: {}, 1: {}, 2: {}, 3: {}, 4: {}, 5: {}},
                    'ip': {}
                }
            if uid not in today_sensitive_results:
                today_sensitive_results[uid] = {}
            #compute hashtag
            hashtag_item = hashtag_results[count]
            if hashtag_item:
                uid_hashtag_dict = json.loads(hashtag_item)
            else:
                uid_hashtag_dict = {}
            for hashtag in uid_hashtag_dict:
                try:
                    iter_results[uid]['hashtag'][hashtag] += uid_hashtag_dict[hashtag]
                except:
                    iter_results[uid]['hashtag'][hashtag] = uid_hashtag_dict[hashtag]
            #compute sensitive
            sensitive_item = sensitive_results[count]
            if sensitive_item:
                uid_sensitive_dict = json.loads(sensitive_item)
            else:
                uid_sensitive_dict = {}
            for sensitive_word in uid_sensitive_dict:
                try:
                    iter_results[uid]['sensitive'][sensitive_word] += uid_sensitive_dict[sensitive_word]
                except:
                    iter_results[uid]['sensitive'][sensitive_word] = uid_sensitive_dict[sensitive_word]
                if ts == now_date_ts - DAY:
                    try:
                        today_sensitive_results[uid][sensitive_word] += uid_sensitive_dict[sensitive_word]
                    except:
                        today_sensitive_results[uid][sensitive_word] = uid_sensitive_dict[sensitive_word]
            #compute geo
            uid_day_geo[uid] = {}
            ip_item = ip_results[count]
            if ip_item:
                uid_ip_dict = json.loads(ip_item)
            else:
                uid_ip_dict = {}
            for ip in uid_ip_dict:
                ip_count = len(uid_ip_dict[ip].split('&'))
                geo, school = ip2city(ip)
                if geo:
                    try:
                        iter_results[uid]['geo'][geo] += ip_count
                    except:
                        iter_results[uid]['geo'][geo] = ip_count
                    try:
                        uid_day_geo[uid][geo] += ip_count
                    except:
                        uid_day_geo[uid][geo] = ip_count
                if school:
                    try:
                        iter_results[uid]['school'][school] += ip_count
                    except:
                        iter_results[uid]['school'][school] = ip_count
                #deal ip: job_ip & home_ip & active_ip
                ip_time_list = uid_ip_dict[ip].split('&')
                try:
                    iter_results[uid]['ip'][ip] += ip_count
                except:
                    # set only this ip's count; replacing the whole dict here
                    # would discard counts accumulated for other ips
                    iter_results[uid]['ip'][ip] = ip_count
                for ip_time_item in ip_time_list:
                    ip_timesegment = (int(ip_time_item) - ts) / IP_TIME_SEGMENT
                    try:
                        iter_results[uid]['week_ip'][ip_timesegment][ip] += 1
                    except:
                        iter_results[uid]['week_ip'][ip_timesegment][ip] = 1
                #end deal ip
            iter_results[uid]['geo_track'].append(uid_day_geo[uid])
            count += 1
    #get keywords top
    for uid in uid_list:
        results[uid] = {}
        #hashtag
        hashtag_dict = iter_results[uid]['hashtag']
        results[uid]['hashtag_dict'] = json.dumps(hashtag_dict)
        results[uid]['hashtag'] = '&'.join(hashtag_dict.keys())
        #sensitive words
        sensitive_word_dict = iter_results[uid]['sensitive']
        results[uid]['sensitive_dict'] = json.dumps(sensitive_word_dict)
        results[uid]['sensitive_string'] = '&'.join(sensitive_word_dict.keys())
        sensitive_score = 0
        today_sensitive_results_user = today_sensitive_results[uid]
        for sensitive_item in today_sensitive_results_user:
            k = sensitive_item
            v = today_sensitive_results_user[sensitive_item]
            tmp_stage = r_sensitive.hget('sensitive_words', k)
            if tmp_stage:
                sensitive_score += v * sensitive_score_dict[str(tmp_stage)]
        results[uid]['sensitive'] = sensitive_score
        #geo
        geo_dict = iter_results[uid]['geo']
        geo_track_list = iter_results[uid]['geo_track']
        results[uid]['activity_geo_dict'] = json.dumps(geo_track_list)
        geo_dict_keys = geo_dict.keys()
        results[uid]['activity_geo'] = '&'.join(['&'.join(item.split('\t')) for item in geo_dict_keys])
        try:
            results[uid]['activity_geo_aggs'] = '&'.join([item.split('\t')[-1] for item in geo_dict_keys])
        except:
            results[uid]['activity_geo_aggs'] = ''
        #keywords
        keywords_dict = all_user_keywords_dict[uid]
        keywords_top50 = sorted(keywords_dict.items(), key=lambda x: x[1], reverse=True)[:50]
        keywords_top50_string = '&'.join([keyword_item[0] for keyword_item in keywords_top50])
        results[uid]['keywords'] = json.dumps(keywords_top50)
        results[uid]['keywords_string'] = keywords_top50_string
        #school dict
        school_dict = iter_results[uid]['school']
        school_string = '&'.join(school_dict.keys())
        if school_dict != {}:
            is_school = '1'
        else:
            is_school = '0'
        results[uid]['is_school'] = is_school
        results[uid]['school_string'] = school_string
        results[uid]['school_dict'] = json.dumps(school_dict)
        #ip: job_ip & home_ip & activity_ip
        #activity_ip
        all_ip_dict = iter_results[uid]['ip']
        sort_all_ip = sorted(all_ip_dict.items(), key=lambda x: x[1], reverse=True)
        try:
            activity_ip = sort_all_ip[0][0]
        except:
            activity_ip = ''
        results[uid]['activity_ip'] = str(activity_ip)
        #job_ip & home_ip
        week_time_ip_dict = iter_results[uid]['week_ip']
        for i in range(0, 6):
            if i not in week_time_ip_dict:
                week_time_ip_dict[i] = {}
        home_ip, job_ip = get_ip_description(week_time_ip_dict)
        results[uid]['home_ip'] = str(home_ip)
        results[uid]['job_ip'] = str(job_ip)
    return results
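# A hedged sketch of get_ip_description (not defined in this section). The
# week_ip buckets above split each day into DAY / IP_TIME_SEGMENT == 6 four-hour
# segments (an assumption); a plausible reading is that the most frequent IP in
# the night segments is home_ip and in the working-hour segments job_ip. The
# *_sketch name marks this as a guess, not the repo implementation.
def get_ip_description_sketch(week_time_ip_dict):
    def top_ip(segments):
        merged = {}
        for seg in segments:
            for ip, c in week_time_ip_dict.get(seg, {}).iteritems():
                merged[ip] = merged.get(ip, 0) + c
        ranked = sorted(merged.items(), key=lambda x: x[1], reverse=True)
        return ranked[0][0] if ranked else ''
    home_ip = top_ip([0, 5])  # roughly 00:00-04:00 and 20:00-24:00
    job_ip = top_ip([2, 3])   # roughly 08:00-16:00
    return home_ip, job_ip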
def get_flow_information(uid_list):
    # data for the previous seven days; not suitable for daily updates
    length = len(uid_list)
    results = {}
    iter_results = {}
    result_dict = {}
    if RUN_TYPE:
        now_ts = time.time()
        now_date = ts2datetime(now_ts)  # date: 2013-09-01
    else:
        now_date = "2013-09-08"
    ts = datetime2ts(now_date)
    start_ts = ts - 8 * 3600 * 24
    for i in range(1, 8):
        ts = start_ts + i * 3600 * 24
        date = ts2datetime(ts)
        print "date:", date
        uid_day_geo = {}
        sensitive_uid_day_geo = {}
        flow_index_name = flow_text_index_name_pre + str(date)
        # hashtag
        print uid_list
        hashtag_results = redis_cluster.hmget("hashtag_" + str(ts), uid_list)
        sensitive_hashtag = redis_cluster.hmget("sensitive_hashtag_" + str(ts), uid_list)
        # sensitive_words
        sensitive_results = redis_cluster.hmget("sensitive_" + str(ts), uid_list)
        # ip
        if WORK_TYPE == 0:
            ip_index_name = ip_index_pre + str(date)
            sensitive_ip_index_name = sen_ip_index_pre + str(date)
            # activity_index_name = act_index_pre + str(date)
            # sensitive_activity_index_name = sen_act_index_pre + str(date)
            exist_bool = es_cluster.indices.exists(index=ip_index_name)
            sensitive_exist_bool = es_cluster.indices.exists(index=sensitive_ip_index_name)
            # activity_exist_bool = es_cluster.indices.exists(index=activity_index_name)
            # sensitive_activity_exist_bool = es_cluster.indices.exists(index=sensitive_activity_index_name)
            if exist_bool:
                ip_results = es_cluster.mget(index=ip_index_name, doc_type="ip",
                                             body={"ids": uid_list})["docs"]
            else:
                ip_results = [dict()] * length
            if sensitive_exist_bool:
                sensitive_ip_results = es_cluster.mget(index=sensitive_ip_index_name,
                                                       doc_type="sensitive_ip",
                                                       body={"ids": uid_list})["docs"]
            else:
                sensitive_ip_results = [dict()] * length
            """
            if activity_exist_bool:
                activity_results = es_cluster.mget(index=activity_index_name, doc_type="activity", body={"ids": uid_list})["docs"]
            else:
                activity_results = [dict()] * length
            if sensitive_activity_exist_bool:
                sensitive_activity_results = es_cluster.mget(index=sensitive_activity_index_name, doc_type="sensitive_activity", body={"ids": uid_list})["docs"]
            else:
                sensitive_activity_results = [dict()] * length
            """
        else:
            ip_results = redis_ip.hmget("ip_" + str(ts), uid_list)
            sensitive_ip_results = redis_ip.hmget("sensitive_ip_" + str(ts), uid_list)
            # activity_results = redis_activity.hmget('activity_'+str(date), uid_list)
            # sensitive_activity_results = redis_activity.hmget('sensitive_activity_'+str(date), uid_list)
        for j in range(0, len(uid_list)):
            uid = uid_list[j]
            if uid not in iter_results:
                iter_results[uid] = {
                    "hashtag": {},
                    "sensitive_hashtag": {},
                    "geo": {},
                    "sensitive_geo": {},
                    "geo_track": [],
                    "keywords": {},
                    "sensitive_words": {},
                    "sensitive_geo_track": [],
                    "ip": [],
                    "sensitive_ip": [],
                }
            # sensitive words
            if sensitive_results[j]:
                sensitive_words_results = json.loads(sensitive_results[j])
                for sensitive_word in sensitive_words_results:
                    try:
                        iter_results[uid]["sensitive_words"][sensitive_word] += sensitive_words_results[sensitive_word]
                    except:
                        iter_results[uid]["sensitive_words"][sensitive_word] = sensitive_words_results[sensitive_word]
            if hashtag_results[j]:
                hashtag_dict = json.loads(hashtag_results[j])
                for hashtag in hashtag_dict:
                    try:
                        iter_results[uid]["hashtag"][hashtag] += hashtag_dict[hashtag]
                    except:
                        iter_results[uid]["hashtag"][hashtag] = hashtag_dict[hashtag]
            if sensitive_hashtag[j]:
                sensitive_hashtag_dict = json.loads(sensitive_hashtag[j])
                for hashtag in sensitive_hashtag_dict:
                    try:
                        iter_results[uid]["sensitive_hashtag"][hashtag] += sensitive_hashtag_dict[hashtag]
                    except:
                        iter_results[uid]["sensitive_hashtag"][hashtag] = sensitive_hashtag_dict[hashtag]
            uid_day_geo[uid] = {}
            sensitive_uid_day_geo[uid] = {}
            if WORK_TYPE == 0:  # es
                if ip_results[j]:
                    if ip_results[j]["found"]:
                        detail_item = ip_results[j]["_source"]
                        ip_dict = json.loads(detail_item["ip_dict"])
                    else:
                        ip_dict = {}
                else:
                    ip_dict = {}
            else:
                if ip_results[j]:
                    ip_dict = json.loads(ip_results[j])
                else:
                    ip_dict = {}
            if ip_dict:
                geo_dict = ip2geo(ip_dict)
                for geo, count in geo_dict.iteritems():
                    try:
                        iter_results[uid]["geo"][geo] += count
                    except:
                        iter_results[uid]["geo"][geo] = count
                    try:
                        uid_day_geo[uid][geo] += count
                    except:
                        uid_day_geo[uid][geo] = count
            iter_results[uid]["geo_track"].append(uid_day_geo[uid])
            if WORK_TYPE == 0:
                if sensitive_ip_results[j]:
                    if sensitive_ip_results[j]["found"]:
                        detail_item = sensitive_ip_results[j]["_source"]
                        sensitive_ip_dict = json.loads(detail_item["sensitive_ip_dict"])
                    else:
                        sensitive_ip_dict = dict()
                else:
                    sensitive_ip_dict = dict()
            else:
                if sensitive_ip_results[j]:
                    sensitive_ip_dict = json.loads(sensitive_ip_results[j])
                else:
                    sensitive_ip_dict = dict()
            if sensitive_ip_dict:
                sensitive_geo_dict = ip2geo(sensitive_ip_dict)
                for geo, count in sensitive_geo_dict.iteritems():
                    try:
                        iter_results[uid]["sensitive_geo"][geo] += count
                    except:
                        iter_results[uid]["sensitive_geo"][geo] = count
                    try:
                        sensitive_uid_day_geo[uid][geo] += count
                    except:
                        sensitive_uid_day_geo[uid][geo] = count
            iter_results[uid]["sensitive_geo_track"].append(sensitive_uid_day_geo[uid])
        # compute keywords
        flow_text_exist = es_flow_text.indices.exists(index=flow_index_name)
        if flow_text_exist:
            text_results = es_flow_text.search(
                index=flow_index_name,
                doc_type=flow_text_index_type,
                body={"query": {"filtered": {"filter": {"terms": {"uid": uid_list}}}}, "size": MAX_VALUE},
                _source=False,
                fields=["uid", "keywords_dict"],
            )["hits"]["hits"]
        else:
            text_results = {}
        for item in text_results:
            uid = item["fields"]["uid"][0]
            uid_keywords_dict = json.loads(item["fields"]["keywords_dict"][0])
            for keywords in uid_keywords_dict:
                try:
                    iter_results[uid]["keywords"][keywords] += uid_keywords_dict[keywords]
                except:
                    iter_results[uid]["keywords"][keywords] = uid_keywords_dict[keywords]
    for uid in uid_list:
        results[uid] = {}
        # hashtag
        hashtag_dict = iter_results[uid]["hashtag"]
        results[uid]["hashtag_dict"] = json.dumps(hashtag_dict)
        results[uid]["hashtag_string"] = "&".join(hashtag_dict.keys())
        # sensitive hashtag
        sensitive_hashtag_dict = iter_results[uid]["sensitive_hashtag"]
        results[uid]["sensitive_hashtag_dict"] = json.dumps(sensitive_hashtag_dict)
        results[uid]["sensitive_hashtag_string"] = "&".join(sensitive_hashtag_dict.keys())
        # sensitive_words
        sensitive_word_dict = iter_results[uid]["sensitive_words"]
        results[uid]["sensitive_words_dict"] = json.dumps(sensitive_word_dict)
        results[uid]["sensitive_words_string"] = "&".join(sensitive_word_dict.keys())
        sensitive_score = 0
        for k, v in sensitive_word_dict.iteritems():
            tmp = r_sensitive.hget("sensitive_words", k)
            if tmp:
                tmp_stage = json.loads(tmp)
                sensitive_score += sensitive_score_dict[str(tmp_stage[0])] * v
        results[uid]["sensitive"] = sensitive_score
        # geo
        geo_dict = iter_results[uid]["geo"]
        geo_track_list = iter_results[uid]["geo_track"]
        results[uid]["activity_geo_dict"] = json.dumps(geo_track_list)
        geo_dict_keys = geo_dict.keys()
        results[uid]["activity_geo"] = "&".join(["&".join(item.split("\t")) for item in geo_dict_keys])
        results[uid]["activity_geo_aggs"] = "&".join([item.split("\t")[-1] for item in geo_dict_keys])
        sensitive_geo_dict = iter_results[uid]["sensitive_geo"]
        sensitive_geo_track_list = iter_results[uid]["sensitive_geo_track"]
        results[uid]["sensitive_activity_geo_dict"] = json.dumps(sensitive_geo_track_list)
        sensitive_geo_dict_keys = sensitive_geo_dict.keys()
        results[uid]["sensitive_activity_geo"] = "&".join(["&".join(item.split("\t")) for item in sensitive_geo_dict_keys])
        results[uid]["sensitive_activity_geo_aggs"] = "&".join([item.split("\t")[-1] for item in sensitive_geo_dict_keys])
        # keywords
        keywords_dict = iter_results[uid]["keywords"]
        keywords_top50 = sorted(keywords_dict.items(), key=lambda x: x[1], reverse=True)[:50]
        keywords_top50_string = "&".join([keyword_item[0] for keyword_item in keywords_top50])
        results[uid]["keywords_dict"] = json.dumps(keywords_top50)
        results[uid]["keywords_string"] = keywords_top50_string
    return results
def get_flow_information_v2(uid_list, all_user_keywords_dict):
    results = {}
    # results = {uid: {'hashtag_dict': {}, 'hashtag': '', 'keywords_dict': {}, 'keywords_string': '',
    #                  'activity_geo': '', 'activity_geo_dict': dict, 'activity_geo_aggs': ''}}
    iter_results = {}
    # iter_results = {uid: {'hashtag': hashtag_dict, 'geo': geo_dict, 'keywords': keywords_dict}}
    now_ts = time.time()
    # run_type
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = test_ts
    for i in range(WEEK, 0, -1):
        ts = now_date_ts - DAY * i
        print ts
        uid_day_geo = {}
        # compute hashtag and geo
        hashtag_results = r_cluster.hmget('hashtag_' + str(ts), uid_list)
        ip_results = r_cluster.hmget('new_ip_' + str(ts), uid_list)
        # compute sensitive_words
        sensitive_results = r_cluster.hmget('sensitive_' + str(ts), uid_list)
        count = 0
        for uid in uid_list:
            # init iter_results[uid]
            if uid not in iter_results:
                iter_results[uid] = {'hashtag': {}, 'geo': {}, 'geo_track': [], 'keywords': {}, 'sensitive': {}}
            # compute hashtag
            hashtag_item = hashtag_results[count]
            if hashtag_item:
                uid_hashtag_dict = json.loads(hashtag_item)
            else:
                uid_hashtag_dict = {}
            for hashtag in uid_hashtag_dict:
                try:
                    iter_results[uid]['hashtag'][hashtag] += uid_hashtag_dict[hashtag]
                except:
                    iter_results[uid]['hashtag'][hashtag] = uid_hashtag_dict[hashtag]
            # compute sensitive
            sensitive_item = sensitive_results[count]
            if sensitive_item:
                uid_sensitive_dict = json.loads(sensitive_item)
            else:
                uid_sensitive_dict = {}
            for sensitive_word in uid_sensitive_dict:
                try:
                    iter_results[uid]['sensitive'][sensitive_word] += uid_sensitive_dict[sensitive_word]
                except:
                    iter_results[uid]['sensitive'][sensitive_word] = uid_sensitive_dict[sensitive_word]
            # compute geo
            uid_day_geo[uid] = {}
            ip_item = ip_results[count]
            if ip_item:
                uid_ip_dict = json.loads(ip_item)
            else:
                uid_ip_dict = {}
            for ip in uid_ip_dict:
                ip_count = len(uid_ip_dict[ip].split('&'))
                geo = ip2city(ip)
                if geo:
                    try:
                        iter_results[uid]['geo'][geo] += ip_count
                    except:
                        iter_results[uid]['geo'][geo] = ip_count
                    try:
                        uid_day_geo[uid][geo] += ip_count
                    except:
                        uid_day_geo[uid][geo] = ip_count
            iter_results[uid]['geo_track'].append(uid_day_geo[uid])
            count += 1
    # get keywords top
    for uid in uid_list:
        results[uid] = {}
        # hashtag
        hashtag_dict = iter_results[uid]['hashtag']
        results[uid]['hashtag_dict'] = json.dumps(hashtag_dict)
        results[uid]['hashtag'] = '&'.join(hashtag_dict.keys())
        # sensitive words
        sensitive_word_dict = iter_results[uid]['sensitive']
        results[uid]['sensitive_dict'] = json.dumps(sensitive_word_dict)
        results[uid]['sensitive_string'] = '&'.join(sensitive_word_dict.keys())
        sensitive_score = 0
        for sensitive_item in sensitive_word_dict:
            k = sensitive_item
            v = sensitive_word_dict[sensitive_item]
            tmp_stage = r_sensitive.hget('sensitive_words', k)
            if tmp_stage:
                # the hash stores a JSON list whose first element is the word's
                # stage; weight the hit count by that stage's score
                tmp = json.loads(tmp_stage)
                sensitive_score += v * sensitive_score_dict[str(tmp[0])]
        results[uid]['sensitive'] = sensitive_score
        #print 'sensitive_dict:', results[uid]['sensitive_dict']
        #print 'sensitive_string:', results[uid]['sensitive_string']
        #print 'sensitive:', results[uid]['sensitive']
        # geo
        geo_dict = iter_results[uid]['geo']
        geo_track_list = iter_results[uid]['geo_track']
        results[uid]['activity_geo_dict'] = json.dumps(geo_track_list)
        geo_dict_keys = geo_dict.keys()
        results[uid]['activity_geo'] = '&'.join(['&'.join(item.split('\t')) for item in geo_dict_keys])
        try:
            results[uid]['activity_geo_aggs'] = '&'.join([item.split('\t')[-1] for item in geo_dict_keys])
        except:
            results[uid]['activity_geo_aggs'] = ''
        keywords_dict = all_user_keywords_dict[uid]
        keywords_top50 = sorted(keywords_dict.items(), key=lambda x: x[1], reverse=True)[:50]
        keywords_top50_string = '&'.join([keyword_item[0] for keyword_item in keywords_top50])
        results[uid]['keywords'] = json.dumps(keywords_top50)
        results[uid]['keywords_string'] = keywords_top50_string
    return results
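
# A minimal usage sketch for get_flow_information_v2, assuming the hashtag_*,
# new_ip_* and sensitive_* redis hashes are populated for the past WEEK days;
# the uid and keyword weights here are illustrative placeholders only.
def _demo_get_flow_information_v2():
    uid_list = ['100003271864059']
    all_user_keywords_dict = {'100003271864059': {u'keyword_a': 3, u'keyword_b': 1}}
    flow_results = get_flow_information_v2(uid_list, all_user_keywords_dict)
    # each entry aggregates a week of hashtag/sensitive/geo activity plus the
    # top-50 keywords serialized for indexing
    print flow_results['100003271864059']['keywords_string']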
# The head of this function was lost upstream; the signature below is assumed
# from the get_sensitive_user call in __main__, and score = 0 is the obvious
# missing initializer for the accumulator returned at the end.
def get_sensitive_user(timestamp, uid):
    score = 0
    query_body = {
        'query': {
            'term': {'uid': uid}
        },
        'size': 50
    }
    index_name = flow_text_index_name_pre + ts2datetime(timestamp)
    try:
        search_results = es_xnr.search(index=index_name, doc_type=flow_text_index_type, body=query_body)['hits']['hits']
    except Exception, e:
        search_results = []
    # build the word tree once; rebuilding it for every hit is wasteful
    node = createWordTree()
    for result in search_results:
        text = result['_source']['text'].encode('utf-8')
        sensitive_words_dict = searchWord(text, node)
        if sensitive_words_dict:
            for k, v in sensitive_words_dict.iteritems():
                tmp_stage = r_sensitive.hget("sensitive_words", k)
                if tmp_stage:
                    # the stage is the first element of the stored JSON list
                    tmp = json.loads(tmp_stage)
                    score += v * sensitive_score_dict[str(tmp[0])]
    return score


if __name__ == '__main__':
    # '2017-10-15'
    # print get_sensitive_user(timestamp=1507996800, uid='100003271864059')
    print get_sensitive_info(timestamp=1507996800, mid='123124323', text=u"64和达赖太阳花")
    print get_sensitive_info(timestamp=1507996800, mid='123124323')
    print get_sensitive_info(timestamp=1507996800, text=u"64和达赖太阳花")
    print get_sensitive_info(timestamp=1507996800)
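
# A worked sketch of the stage-weighted scoring used throughout this file,
# with purely illustrative stages and weights: every matched word contributes
# hit_count * score_of_its_stage, so two stage-1 hits plus one stage-3 hit
# under the weights below score 2*1 + 1*3 = 5. Nothing here is real data.
def _demo_stage_weighted_score():
    word_hits = {u'word_a': 2, u'word_b': 1}        # hypothetical searchWord() output
    word_stage = {u'word_a': '1', u'word_b': '3'}   # hypothetical stages from r_sensitive
    demo_score_dict = {'1': 1, '2': 2, '3': 3}      # illustrative stage weights
    return sum(v * demo_score_dict[word_stage[k]] for k, v in word_hits.iteritems())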
def main():
    if RUN_TYPE:
        now_ts = time.time() - DAY  # the previous day
        ts = str(datetime2ts(ts2datetime(now_ts)))
    else:
        ts = str(datetime2ts('2013-09-02'))
    now_ts = int(ts)
    print now_ts
    sensitive_string = "sensitive_" + ts
    date_string = ts
    update_sensitive_key = "sensitive_score_" + ts  # key to update
    sensitive_dict_key = "sensitive_dict_" + ts
    sensitive_string_key = "sensitive_string_" + ts
    sensitive_day_change_key = "sensitive_" + ts + "_day_change"
    del_month = datetime2ts(ts2datetime(now_ts - MONTH))
    del_sensitive_key = "sensitive_score_" + str(del_month)  # key to delete
    former_ts = int(ts) - DAY
    former_date = str(datetime2ts(ts2datetime(former_ts)))
    former_sensitive_key = "sensitive_score_" + former_date
    iter_count = 0
    bulk_action = []
    mappings(ES_SENSITIVE_INDEX)
    total_number = r_cluster.hlen(sensitive_string)
    scan_cursor = 0
    print total_number
    while 1:
        re_scan = r_cluster.hscan(sensitive_string, scan_cursor, count=1000)
        scan_cursor = re_scan[0]
        if len(re_scan[1]) != 0:
            sensitive_info = re_scan[1]  # dict: uid -> sensitive_words_dict
            uid_list = sensitive_info.keys()
            sensitive_results = es.mget(index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX, body={"ids": uid_list})['docs']
            if sensitive_results:
                for item in sensitive_results:
                    uid = item['_id']
                    sensitive_words_dict = json.loads(sensitive_info[uid])  # json.loads
                    current_sensitive_score = 0
                    for k, v in sensitive_words_dict.iteritems():
                        tmp_stage = r_sensitive.hget("sensitive_words", k)
                        if tmp_stage:
                            tmp_stage = json.loads(tmp_stage)
                            current_sensitive_score += v * sensitive_score_dict[str(tmp_stage[0])]
                    if item['found']:  # a record already exists for this uid
                        revise_item = item["_source"]
                        if del_sensitive_key in revise_item:
                            revise_item.pop(del_sensitive_key)
                        revise_item['uid'] = uid
                        # newly updated sensitivity score
                        revise_item[update_sensitive_key] = current_sensitive_score
                        revise_item['last_value'] = current_sensitive_score
                        # newly updated sensitive words
                        revise_item[sensitive_dict_key] = sensitive_info[uid]
                        # newly updated string
                        revise_item[sensitive_string_key] = "&".join(sensitive_words_dict.keys())
                        # difference between today and the previous day, and the weekly and monthly means
                        revise_item['sensitive_day_change'] = current_sensitive_score - revise_item.get(former_sensitive_key, 0)
                        revise_item['sensitive_week_change'] = current_sensitive_score - revise_item.get('sensitive_week_ave', 0)
                        revise_item['sensitive_month_change'] = current_sensitive_score - revise_item.get('sensitive_month_ave', 0)
                        # updated weekly/monthly mean and variance
                        revise_item['sensitive_week_ave'], revise_item['sensitive_week_var'], revise_item['sensitive_week_sum'] = compute_week(revise_item, now_ts)
                        revise_item['sensitive_month_ave'], revise_item['sensitive_month_var'], revise_item['sensitive_month_sum'] = compute_month(revise_item, now_ts)
                    else:
                        revise_item = dict()
                        revise_item['uid'] = uid
                        revise_item[update_sensitive_key] = current_sensitive_score
                        revise_item['last_value'] = current_sensitive_score
                        revise_item[sensitive_dict_key] = sensitive_info[uid]
                        revise_item[sensitive_string_key] = "&".join(sensitive_words_dict.keys())
                        revise_item['sensitive_day_change'] = current_sensitive_score
                        revise_item['sensitive_week_change'] = current_sensitive_score
                        revise_item['sensitive_month_change'] = current_sensitive_score
                        revise_item['sensitive_week_ave'], revise_item['sensitive_week_var'], revise_item['sensitive_week_sum'] = compute_week(revise_item, now_ts)
                        revise_item['sensitive_month_ave'], revise_item['sensitive_month_var'], revise_item['sensitive_month_sum'] = compute_month(revise_item, now_ts)
                    action = {'index': {'_id': uid}}
                    bulk_action.extend([action, revise_item])
                    iter_count += 1
                    if iter_count % 1000 == 0:
                        es.bulk(bulk_action, index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX)
                        bulk_action = []
                        print iter_count
        if int(scan_cursor) == 0:
            break
    if bulk_action:
        es.bulk(bulk_action, index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX)
    print iter_count

    ####### update the users not yet covered above
    update_scan = scan(es, query={"query": {"filtered": {"filter": {"missing": {"field": update_sensitive_key}}}}}, index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX)
    iter_count = 0
    bulk_action = []
    while 1:
        try:
            tmp = update_scan.next()
            revise_item = tmp['_source']
            if del_sensitive_key in revise_item:
                revise_item.pop(del_sensitive_key)
            uid = tmp['_id']
            # newly updated sensitivity score
            revise_item[update_sensitive_key] = 0
            revise_item['last_value'] = 0
            # newly updated sensitive words
            revise_item[sensitive_dict_key] = json.dumps({})
            # newly updated string
            revise_item[sensitive_string_key] = ""
            # difference between today and the previous day, and the weekly and monthly means
            revise_item['sensitive_day_change'] = 0 - revise_item.get(former_sensitive_key, 0)
            revise_item['sensitive_week_change'] = 0 - revise_item.get('sensitive_week_ave', 0)
            revise_item['sensitive_month_change'] = 0 - revise_item.get('sensitive_month_ave', 0)
            # updated weekly/monthly mean and variance
            revise_item['sensitive_week_ave'], revise_item['sensitive_week_var'], revise_item['sensitive_week_sum'] = compute_week(revise_item, now_ts)
            revise_item['sensitive_month_ave'], revise_item['sensitive_month_var'], revise_item['sensitive_month_sum'] = compute_month(revise_item, now_ts)
            action = {'index': {'_id': uid}}
            bulk_action.extend([action, revise_item])
            iter_count += 1
            if iter_count % 1000 == 0:
                es.bulk(bulk_action, index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX)
                bulk_action = []
        except StopIteration:
            print "all done"
            if bulk_action:
                es.bulk(bulk_action, index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX)
            break
        except Exception, r:
            print Exception, r
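
# main() has no visible entry point in the source; the guard below is an
# assumed way to run the daily update as a standalone job.
if __name__ == '__main__':
    main()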