def update_day_hashtag(uid_list):
    """Summarise the hashtags each user used during the last WEEK days.

    For each of the WEEK days preceding the reference date, the hash
    ``hashtag_<day_ts>`` is read from ``r_cluster_3`` with the uids as
    fields.  Every non-empty value is decoded as a JSON object keyed by
    hashtag.  For each hashtag the number of *days* on which it appears is
    counted; the values stored inside the JSON object are not used.

    Args:
        uid_list: sequence of user ids, used as hash fields.

    Returns:
        dict mapping each uid to
        ``{'hashtag': '&'-joined hashtags,
           'hashtag_dict': JSON string of {hashtag: days seen}}``.
        An empty ``uid_list`` yields an empty dict.
    """
    # HMGET needs at least one field, so an empty request is answered here
    # instead of being sent to the store.
    # NOTE(review): assumes r_cluster_3 is a Redis client -- confirm.
    if not uid_list:
        return {}

    # run_type: 1 uses the current date, anything else the fixed test date
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(time.time()))
    else:
        now_date_ts = test_ts

    # Pre-seed every uid so the final loop never hits a missing key.
    results = {uid: {} for uid in uid_list}
    for i in range(WEEK, 0, -1):
        ts = now_date_ts - DAY * i
        hashtag_results = r_cluster_3.hmget('hashtag_' + str(ts), uid_list)
        # hmget answers in field order, so pair answers with their uid.
        for uid, hashtag_item in zip(uid_list, hashtag_results):
            if not hashtag_item:
                continue
            day_counts = results[uid]
            for hashtag in json.loads(hashtag_item):
                day_counts[hashtag] = day_counts.get(hashtag, 0) + 1

    all_results = {}
    for uid in uid_list:
        user_hashtag_dict = results[uid]
        all_results[uid] = {
            'hashtag': '&'.join(user_hashtag_dict.keys()),
            'hashtag_dict': json.dumps(user_hashtag_dict),
        }
    return all_results
def update_day_sensitive(uid_list):
    """Summarise the sensitive words each user used during the last WEEK days.

    For each of the WEEK days preceding the reference date, the hash
    ``sensitive_<day_ts>`` is read from ``r_cluster_3`` with the uids as
    fields.  Every non-empty value is decoded as a JSON object keyed by
    sensitive word.  Per user, each word is counted once per day on which
    it appears.  Words seen on the most recent day (reference date minus
    one DAY) are additionally collected to compute the score.

    The score is the sum, over the most recent day's words, of
    ``count * sensitive_score_dict[str(stage)]``, where ``stage`` is the
    value stored for the word in the ``sensitive_words`` hash of
    ``r_sensitive``.  Words without a stored stage contribute nothing.

    Args:
        uid_list: sequence of user ids, used as hash fields.

    Returns:
        dict mapping each uid to
        ``{'sensitive_string': '&'-joined words,
           'sensitive_dict': JSON string of {word: days seen},
           'sensitive': score}``.
        An empty ``uid_list`` yields an empty dict.
    """
    # HMGET needs at least one field, so an empty request is answered here.
    # NOTE(review): assumes r_cluster_3 is a Redis client -- confirm.
    if not uid_list:
        return {}

    # run_type: 1 uses the current date, anything else the fixed test date
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(time.time()))
    else:
        now_date_ts = test_ts
    latest_day_ts = now_date_ts - DAY

    results = {uid: {} for uid in uid_list}
    today_sensitive_dict = {uid: {} for uid in uid_list}
    for i in range(WEEK, 0, -1):
        ts = now_date_ts - DAY * i
        sensitive_results = r_cluster_3.hmget('sensitive_' + str(ts), uid_list)
        for uid, sensitive_item in zip(uid_list, sensitive_results):
            if not sensitive_item:
                continue
            week_counts = results[uid]
            today_counts = today_sensitive_dict[uid]
            for sensitive in json.loads(sensitive_item):
                week_counts[sensitive] = week_counts.get(sensitive, 0) + 1
                if ts == latest_day_ts:
                    today_counts[sensitive] = today_counts.get(sensitive, 0) + 1

    # The same word is usually shared by many users; look its stage up once.
    stage_by_word = {}
    all_results = {}
    for uid in uid_list:
        user_sensitive_dict = results[uid]
        sensitive_score = 0
        for word, occurrences in today_sensitive_dict[uid].items():
            if word not in stage_by_word:
                stage_by_word[word] = r_sensitive.hget('sensitive_words', word)
            tmp_stage = stage_by_word[word]
            if tmp_stage:
                sensitive_score += occurrences * sensitive_score_dict[str(tmp_stage)]
        all_results[uid] = {
            'sensitive_string': '&'.join(user_sensitive_dict.keys()),
            'sensitive_dict': json.dumps(user_sensitive_dict),
            'sensitive': sensitive_score,
        }
    return all_results
def update_day_sensitive(uid_list):
    """Build the weekly sensitive-word summary and daily score per user.

    Reads ``sensitive_<day_ts>`` from ``r_cluster_3`` for each of the WEEK
    days before the reference date (uids are the hash fields) and decodes
    every non-empty value as a JSON object keyed by sensitive word.

    * The weekly dictionary counts, per word, the number of days on which
      the word appears for the user.
    * The score only considers the most recent day (reference date minus
      one DAY).  Each word found in the ``sensitive_words`` hash of
      ``r_sensitive`` adds ``count * sensitive_score_dict[str(stage)]``.

    Args:
        uid_list: sequence of user ids, used as hash fields.

    Returns:
        dict ``{uid: {'sensitive_string': str, 'sensitive_dict': JSON str,
        'sensitive': score}}``; empty when ``uid_list`` is empty.
    """
    # An HMGET without fields is rejected by the server, so bail out early.
    # NOTE(review): assumes r_cluster_3 is a Redis client -- confirm.
    if not uid_list:
        return {}

    # run_type
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(time.time()))
    else:
        now_date_ts = test_ts
    yesterday_ts = now_date_ts - DAY

    results = {}
    today_sensitive_dict = {}
    for uid in uid_list:
        results[uid] = {}
        today_sensitive_dict[uid] = {}

    for i in range(WEEK, 0, -1):
        ts = now_date_ts - DAY * i
        is_latest_day = (ts == yesterday_ts)
        sensitive_results = r_cluster_3.hmget('sensitive_' + str(ts), uid_list)
        for uid, sensitive_item in zip(uid_list, sensitive_results):
            sensitive_dict = json.loads(sensitive_item) if sensitive_item else {}
            for sensitive in sensitive_dict:
                results[uid][sensitive] = results[uid].get(sensitive, 0) + 1
                if is_latest_day:
                    today_sensitive_dict[uid][sensitive] = \
                        today_sensitive_dict[uid].get(sensitive, 0) + 1

    # Cache stage lookups: one remote call per distinct word, not per user.
    stage_cache = {}
    all_results = {}
    for uid in uid_list:
        sensitive_score = 0
        today_sensitive_dict_user = today_sensitive_dict[uid]
        for k in today_sensitive_dict_user:
            v = today_sensitive_dict_user[k]
            if k not in stage_cache:
                stage_cache[k] = r_sensitive.hget('sensitive_words', k)
            tmp_stage = stage_cache[k]
            if tmp_stage:
                sensitive_score += v * sensitive_score_dict[str(tmp_stage)]
        user_sensitive_dict = results[uid]
        all_results[uid] = {
            'sensitive_string': '&'.join(user_sensitive_dict.keys()),
            'sensitive_dict': json.dumps(user_sensitive_dict),
            'sensitive': sensitive_score,
        }
    return all_results
def update_day_hashtag(uid_list):
    """Build the weekly hashtag summary for every user in ``uid_list``.

    Reads ``hashtag_<day_ts>`` from ``r_cluster_3`` for each of the WEEK
    days before the reference date (uids are the hash fields), decodes every
    non-empty value as a JSON object keyed by hashtag, and counts for each
    hashtag the number of days on which the user used it.

    Args:
        uid_list: sequence of user ids, used as hash fields.

    Returns:
        dict ``{uid: {'hashtag': '&'-joined hashtags,
        'hashtag_dict': JSON string of {hashtag: days seen}}}``;
        empty when ``uid_list`` is empty.
    """
    # An HMGET without fields is rejected by the server, so bail out early.
    # NOTE(review): assumes r_cluster_3 is a Redis client -- confirm.
    if not uid_list:
        return {}

    # run_type
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(time.time()))
    else:
        now_date_ts = test_ts

    results = {}
    for uid in uid_list:
        results[uid] = {}

    for i in range(WEEK, 0, -1):
        ts = now_date_ts - DAY * i
        hashtag_results = r_cluster_3.hmget('hashtag_' + str(ts), uid_list)
        for uid, hashtag_item in zip(uid_list, hashtag_results):
            hashtag_dict = json.loads(hashtag_item) if hashtag_item else {}
            for hashtag in hashtag_dict:
                # dict.get replaces the former bare "except:" key test, which
                # would also have hidden unrelated errors.
                results[uid][hashtag] = results[uid].get(hashtag, 0) + 1

    all_results = {}
    for uid in uid_list:
        user_hashtag_dict = results[uid]
        hashtag_string = '&'.join(user_hashtag_dict.keys())
        all_results[uid] = {
            'hashtag': hashtag_string,
            'hashtag_dict': json.dumps(user_hashtag_dict),
        }
    return all_results
# One-off export: dump the stored hashtag record of every listed user for a
# fixed set of days, one JSON array [uid, hashtag_dict, day_ts] per line.

# Read the user ids, one per line; "with" closes the file even on error.
uidlist = []
with open("uid_list_0520.txt") as f:
    for line in f:
        uidlist.append(line.strip())

data = []
dates = [
    "2016-05-14", "2016-05-15", "2016-05-16", "2016-05-17",
    "2016-05-18", "2016-05-19", "2016-05-20",
]
tss = [datetime2ts(d) for d in dates]

for ts in tss:
    ns = "hashtag_" + str(ts)
    hashtag_list = R_CLUSTER_FLOW3.hmget(ns, uidlist)
    # hmget answers in field order; users without a record map to None.
    for uid, h in zip(uidlist, hashtag_list):
        data.append([uid, json.loads(h) if h else None, ts])

with open("hashtag_0521.txt", "w") as fw:
    for d in data:
        # Only users that actually have a record for the day are exported.
        if d[1] is not None:
            fw.write("%s\n" % json.dumps(d))

at_data = []
def get_flow_information(uid_list):
    """Aggregate one week of per-user flow data into profile fields.

    For each of the 7 days preceding the reference date this reads, with
    the uids as hash fields:

    * ``hashtag_<day_ts>`` and ``sensitive_<day_ts>`` from ``r_cluster_3``
      (JSON objects ``{word: count}``, summed over the week),
    * ``new_ip_<day_ts>`` from ``r_cluster`` (JSON object keyed by ip whose
      values are '&'-joined strings; the number of joined parts is used as
      the weight of the location returned by ``ip2city(ip)``),
    * the ``keywords_dict`` field of the day's flow-text index in
      ``es_flow_text`` (JSON objects ``{keyword: weight}``, summed).

    The sensitive score is computed from the *weekly* totals as
    ``sum(count * sensitive_score_dict[str(stage)])`` for every word that
    has a stage in the ``sensitive_words`` hash of ``r_sensitive``.

    Args:
        uid_list: sequence of user ids.

    Returns:
        dict mapping each uid to a dict with the keys ``hashtag_dict``,
        ``hashtag``, ``sensitive_dict``, ``sensitive_string``,
        ``sensitive``, ``activity_geo_dict`` (JSON list with one
        ``{geo: weight}`` dict per day), ``activity_geo``, ``keywords``
        (JSON list of the 50 heaviest ``[keyword, weight]`` pairs) and
        ``keywords_string``.  An empty ``uid_list`` yields an empty dict.
    """
    # HMGET needs at least one field, so an empty request is answered here.
    # NOTE(review): assumes r_cluster / r_cluster_3 are Redis clients.
    if not uid_list:
        return {}

    # run_type: previously the date was unconditionally overwritten with
    # test_ts; it now follows the RUN_TYPE switch used by the sibling
    # functions so production runs use the real date.
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(time.time()))
    else:
        now_date_ts = test_ts

    # iter_results = {uid: {'hashtag': {}, 'geo': {}, 'geo_track': [],
    #                       'keywords': {}, 'sensitive': {}}}
    iter_results = {}
    for uid in uid_list:
        iter_results[uid] = {'hashtag': {}, 'geo': {}, 'geo_track': [],
                             'keywords': {}, 'sensitive': {}}

    for i in range(7, 0, -1):
        ts = now_date_ts - DAY * i
        flow_text_index_name = flow_text_index_name_pre + ts2datetime(ts)
        hashtag_results = r_cluster_3.hmget('hashtag_' + str(ts), uid_list)
        ip_results = r_cluster.hmget('new_ip_' + str(ts), uid_list)
        sensitive_results = r_cluster_3.hmget('sensitive_' + str(ts), uid_list)

        # hmget answers in field order, so the three answers line up by uid.
        for uid, hashtag_item, sensitive_item, ip_item in zip(
                uid_list, hashtag_results, sensitive_results, ip_results):
            user_iter = iter_results[uid]

            # compute hashtag
            hashtag_total = user_iter['hashtag']
            uid_hashtag_dict = json.loads(hashtag_item) if hashtag_item else {}
            for hashtag in uid_hashtag_dict:
                hashtag_total[hashtag] = (hashtag_total.get(hashtag, 0)
                                          + uid_hashtag_dict[hashtag])

            # compute sensitive
            sensitive_total = user_iter['sensitive']
            uid_sensitive_dict = json.loads(sensitive_item) if sensitive_item else {}
            for sensitive_word in uid_sensitive_dict:
                sensitive_total[sensitive_word] = (
                    sensitive_total.get(sensitive_word, 0)
                    + uid_sensitive_dict[sensitive_word])

            # compute geo: weekly totals plus a separate dict for this day
            geo_total = user_iter['geo']
            day_geo = {}
            uid_ip_dict = json.loads(ip_item) if ip_item else {}
            for ip in uid_ip_dict:
                ip_count = len(uid_ip_dict[ip].split('&'))
                # NOTE(review): another variant in this file unpacks
                # ip2city() into (geo, school); confirm which return shape
                # the current ip2city has.
                geo = ip2city(ip)
                if geo:
                    geo_total[geo] = geo_total.get(geo, 0) + ip_count
                    day_geo[geo] = day_geo.get(geo, 0) + ip_count
            user_iter['geo_track'].append(day_geo)

        # compute keywords
        try:
            text_results = es_flow_text.search(
                index=flow_text_index_name,
                doc_type=flow_text_index_type,
                body={'query': {'filtered': {'filter': {'terms': {'uid': uid_list}}}},
                      'size': MAX_VALUE},
                _source=True,
                fields=['uid', 'keywords_dict'])['hits']['hits']
        except Exception:
            # Best effort, as before: a day whose search fails simply
            # contributes no keywords.
            text_results = []
        for item in text_results:
            uid = item['fields']['uid'][0]
            uid_keywords_dict = json.loads(item['fields']['keywords_dict'][0])
            for keywords in uid_keywords_dict:
                keywords_total = iter_results[uid]['keywords']
                keywords_total[keywords] = (keywords_total.get(keywords, 0)
                                            + uid_keywords_dict[keywords])

    # The same word is usually shared by many users; look its stage up once.
    stage_by_word = {}
    results = {}
    for uid in uid_list:
        user_iter = iter_results[uid]
        user_result = {}

        # hashtag
        hashtag_dict = user_iter['hashtag']
        user_result['hashtag_dict'] = json.dumps(hashtag_dict)
        user_result['hashtag'] = '&'.join(hashtag_dict.keys())

        # sensitive words
        sensitive_word_dict = user_iter['sensitive']
        user_result['sensitive_dict'] = json.dumps(sensitive_word_dict)
        user_result['sensitive_string'] = '&'.join(sensitive_word_dict.keys())
        sensitive_score = 0
        for k in sensitive_word_dict:
            v = sensitive_word_dict[k]
            if k not in stage_by_word:
                stage_by_word[k] = r_sensitive.hget('sensitive_words', k)
            tmp_stage = stage_by_word[k]
            if tmp_stage:
                sensitive_score += v * sensitive_score_dict[str(tmp_stage)]
        user_result['sensitive'] = sensitive_score

        # geo: tab-separated location levels are flattened into '&' parts
        geo_dict_keys = user_iter['geo'].keys()
        user_result['activity_geo_dict'] = json.dumps(user_iter['geo_track'])
        user_result['activity_geo'] = '&'.join(
            ['&'.join(item.split('\t')) for item in geo_dict_keys])

        # keywords: keep the 50 heaviest
        keywords_top50 = sorted(user_iter['keywords'].items(),
                                key=lambda x: x[1], reverse=True)[:50]
        user_result['keywords'] = json.dumps(keywords_top50)
        user_result['keywords_string'] = '&'.join(
            [keyword_item[0] for keyword_item in keywords_top50])

        results[uid] = user_result
    return results
def get_flow_information_v2(uid_list, all_user_keywords_dict):
    """Aggregate one week of per-user flow data into profile fields.

    For each of the WEEK days preceding the reference date this reads, with
    the uids as hash fields:

    * ``hashtag_<day_ts>`` and ``sensitive_<day_ts>`` from ``r_cluster_3``
      (JSON objects ``{word: count}``, summed over the week),
    * ``new_ip_<day_ts>`` from ``r_cluster`` (JSON object keyed by ip whose
      values are '&'-joined strings; the number of joined parts is used as
      the weight of the ``(geo, school)`` pair returned by ``ip2city(ip)``).

    The sensitive score only uses the counts of the most recent day
    (reference date minus one DAY):
    ``sum(count * sensitive_score_dict[str(stage)])`` for every word that
    has a stage in the ``sensitive_words`` hash of ``r_sensitive``.

    Args:
        uid_list: sequence of user ids.
        all_user_keywords_dict: mapping uid -> ``{keyword: weight}``; must
            contain every uid of ``uid_list``.

    Returns:
        dict mapping each uid to a dict with the keys ``hashtag_dict``,
        ``hashtag``, ``sensitive_dict``, ``sensitive_string``,
        ``sensitive``, ``activity_geo_dict`` (JSON list with one
        ``{geo: weight}`` dict per day), ``activity_geo``,
        ``activity_geo_aggs`` (last tab-separated level of every location),
        ``keywords`` (JSON list of the 50 heaviest pairs),
        ``keywords_string``, ``is_school`` ('1' or '0'), ``school_string``
        and ``school_dict``.  An empty ``uid_list`` yields an empty dict.

    Raises:
        KeyError: if a uid is missing from ``all_user_keywords_dict``.
    """
    # HMGET needs at least one field, so an empty request is answered here.
    # NOTE(review): assumes r_cluster / r_cluster_3 are Redis clients.
    if not uid_list:
        return {}

    # run_type: 1 uses the current date, anything else the fixed test date
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(time.time()))
    else:
        now_date_ts = test_ts
    latest_day_ts = now_date_ts - DAY

    # iter_results = {uid: {'hashtag': {}, 'geo': {}, 'geo_track': [],
    #                       'sensitive': {}, 'school': {}}}
    iter_results = {}
    today_sensitive_results = {}
    for uid in uid_list:
        iter_results[uid] = {'hashtag': {}, 'geo': {}, 'geo_track': [],
                             'sensitive': {}, 'school': {}}
        today_sensitive_results[uid] = {}

    for i in range(WEEK, 0, -1):
        ts = now_date_ts - DAY * i
        hashtag_results = r_cluster_3.hmget('hashtag_' + str(ts), uid_list)
        ip_results = r_cluster.hmget('new_ip_' + str(ts), uid_list)
        sensitive_results = r_cluster_3.hmget('sensitive_' + str(ts), uid_list)

        # hmget answers in field order, so the three answers line up by uid.
        for uid, hashtag_item, sensitive_item, ip_item in zip(
                uid_list, hashtag_results, sensitive_results, ip_results):
            user_iter = iter_results[uid]

            # compute hashtag
            hashtag_total = user_iter['hashtag']
            uid_hashtag_dict = json.loads(hashtag_item) if hashtag_item else {}
            for hashtag in uid_hashtag_dict:
                hashtag_total[hashtag] = (hashtag_total.get(hashtag, 0)
                                          + uid_hashtag_dict[hashtag])

            # compute sensitive (weekly total and most-recent-day total)
            sensitive_total = user_iter['sensitive']
            today_total = today_sensitive_results[uid]
            uid_sensitive_dict = json.loads(sensitive_item) if sensitive_item else {}
            for sensitive_word in uid_sensitive_dict:
                word_count = uid_sensitive_dict[sensitive_word]
                sensitive_total[sensitive_word] = (
                    sensitive_total.get(sensitive_word, 0) + word_count)
                if ts == latest_day_ts:
                    today_total[sensitive_word] = (
                        today_total.get(sensitive_word, 0) + word_count)

            # compute geo / school
            geo_total = user_iter['geo']
            school_total = user_iter['school']
            day_geo = {}
            uid_ip_dict = json.loads(ip_item) if ip_item else {}
            for ip in uid_ip_dict:
                ip_count = len(uid_ip_dict[ip].split('&'))
                geo, school = ip2city(ip)
                if geo:
                    geo_total[geo] = geo_total.get(geo, 0) + ip_count
                    day_geo[geo] = day_geo.get(geo, 0) + ip_count
                if school:
                    school_total[school] = school_total.get(school, 0) + ip_count
            user_iter['geo_track'].append(day_geo)

    # The same word is usually shared by many users; look its stage up once.
    stage_by_word = {}
    results = {}
    for uid in uid_list:
        user_iter = iter_results[uid]
        user_result = {}

        # hashtag
        hashtag_dict = user_iter['hashtag']
        user_result['hashtag_dict'] = json.dumps(hashtag_dict)
        user_result['hashtag'] = '&'.join(hashtag_dict.keys())

        # sensitive words
        sensitive_word_dict = user_iter['sensitive']
        user_result['sensitive_dict'] = json.dumps(sensitive_word_dict)
        user_result['sensitive_string'] = '&'.join(sensitive_word_dict.keys())
        sensitive_score = 0
        today_sensitive_results_user = today_sensitive_results[uid]
        for k in today_sensitive_results_user:
            v = today_sensitive_results_user[k]
            if k not in stage_by_word:
                stage_by_word[k] = r_sensitive.hget('sensitive_words', k)
            tmp_stage = stage_by_word[k]
            if tmp_stage:
                sensitive_score += v * sensitive_score_dict[str(tmp_stage)]
        user_result['sensitive'] = sensitive_score

        # geo: tab-separated location levels are flattened into '&' parts;
        # the aggregate string keeps only the last level of each location.
        geo_dict_keys = list(user_iter['geo'].keys())
        user_result['activity_geo_dict'] = json.dumps(user_iter['geo_track'])
        user_result['activity_geo'] = '&'.join(
            ['&'.join(item.split('\t')) for item in geo_dict_keys])
        user_result['activity_geo_aggs'] = '&'.join(
            [item.split('\t')[-1] for item in geo_dict_keys])

        # keywords: keep the 50 heaviest of the caller-supplied weights
        keywords_dict = all_user_keywords_dict[uid]
        keywords_top50 = sorted(keywords_dict.items(),
                                key=lambda x: x[1], reverse=True)[:50]
        user_result['keywords'] = json.dumps(keywords_top50)
        user_result['keywords_string'] = '&'.join(
            [keyword_item[0] for keyword_item in keywords_top50])

        # school dict
        school_dict = user_iter['school']
        user_result['is_school'] = '1' if school_dict else '0'
        user_result['school_string'] = '&'.join(school_dict.keys())
        user_result['school_dict'] = json.dumps(school_dict)

        results[uid] = user_result
    return results
# json is used below (loads/dumps) but was not among the imports.
import json

from time_utils import datetime2ts, ts2datetime
from global_utils import R_CLUSTER_FLOW3, R_CLUSTER_FLOW2, R_CLUSTER_FLOW1

# One-off export: dump the stored hashtag record of every listed user for a
# fixed set of days, one JSON array [uid, hashtag_dict, day_ts] per line.

# Read the user ids, one per line; "with" closes the file even on error.
uidlist = []
with open("uid_list_0520.txt") as f:
    for line in f:
        uid = line.strip()
        uidlist.append(uid)

data = []
dates = ["2016-05-14", "2016-05-15", "2016-05-16", "2016-05-17",
         "2016-05-18", "2016-05-19", "2016-05-20"]
tss = [datetime2ts(d) for d in dates]

for ts in tss:
    ns = "hashtag_" + str(ts)
    hashtag_list = R_CLUSTER_FLOW3.hmget(ns, uidlist)
    # hmget answers in field order; users without a record map to None.
    hashtag_list = [json.loads(h) if h else None for h in hashtag_list]
    data.extend([uid, hashtags, ts]
                for uid, hashtags in zip(uidlist, hashtag_list))

with open("hashtag_0521.txt", "w") as fw:
    for d in data:
        # Only users that actually have a record for the day are exported.
        if d[1] is not None:
            fw.write("%s\n" % json.dumps(d))

at_data = []