def main():
    RUN_TYPE = 0
    if RUN_TYPE == 1:
        now_ts = time.time()
    else:
        now_ts = datetime2ts(RUN_TEST_TIME)
    now_ts = datetime2ts('2013-09-02')  # test date override
    date = ts2datetime(now_ts - DAY)
    # auto recommendation: steps 1-4
    # step1: read from top es_daily_rank
    top_user_set, user_dict = search_from_es(date)
    # step2: filter black_uid
    black_user_set = read_black_user()
    subtract_user_set = top_user_set - black_user_set
    # step3: filter users who are already in
    subtract_user_set = list(subtract_user_set)
    candidate_results = filter_in(subtract_user_set)
    # step4: filter rules about ip count & reposts/bereposts count & activity count
    results = filter_rules(candidate_results)
    # step5: get sensitive users
    sensitive_user = list(get_sensitive_user(date))
    results = results - set(sensitive_user)  # influence users minus sensitive users
    new_date = ts2datetime(now_ts)
    hashname_influence = "recomment_" + new_date + "_influence"
    if results:
        for uid in results:
            #print uid
            r.hset(hashname_influence, uid, "0")
    hashname_sensitive = "recomment_" + new_date + "_sensitive"
    if sensitive_user:
        for uid in sensitive_user:
            #print "sensitive"
            r.hset(hashname_sensitive, uid, "0")
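# A minimal usage sketch (assumption: `r` is the same redis-py client used above and the
# daily hashes written by main() exist): collect the uids queued for recommendation on a
# given 'YYYY-MM-DD' date from both the influence and the sensitive hash.
def list_recommended_users(date):
    influence_uids = r.hkeys('recomment_' + date + '_influence')
    sensitive_uids = r.hkeys('recomment_' + date + '_sensitive')
    return set(influence_uids) | set(sensitive_uids)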
def filter_recommend(top_user_set):
    recommend_keys = r.hkeys('recommend')
    recommend_list = []
    for key in recommend_keys:
        recommend_list.extend(json.loads(r.hget('recommend', key)))
    results = set(top_user_set) - set(recommend_list)
    return results
def change_status_compute_fail(mapping_dict):
    hash_name = 'compute'
    status = 1
    new_mapping_dict = {}
    for uid in mapping_dict:
        user_list = json.loads(mapping_dict[uid])
        user_list[1] = '1'
        new_mapping_dict[uid] = json.dumps(user_list)
    r.hmset(hash_name, new_mapping_dict)  # fixed: was r.hmset(hashname, ...), an undefined name
def change_status_computed(mapping_dict):
    hash_name = "compute"
    status = 4
    new_mapping_dict = {}
    for uid in mapping_dict:
        user_list = json.loads(mapping_dict[uid])
        user_list[1] = "4"
        new_mapping_dict[uid] = json.dumps(user_list)
    r.hmset(hash_name, new_mapping_dict)
def change_status_compute_fail(mapping_dict):
    hash_name = 'compute'
    status = 2
    new_mapping_dict = {}
    for uid in mapping_dict:
        user_list = json.loads(mapping_dict[uid])
        user_list[1] = '2'
        new_mapping_dict[uid] = json.dumps(user_list)
    r.hmset(hash_name, new_mapping_dict)  # fixed: was r.hmset(hashname, ...), an undefined name
def change_status_computed(mapping_dict):
    hash_name = r_user_hash_name
    status = 4
    new_mapping_dict = {}
    for uid in mapping_dict:
        user_list = json.loads(mapping_dict[uid])
        user_list[1] = '4'
        new_mapping_dict[uid] = json.dumps(user_list)
    r.hmset(hash_name, new_mapping_dict)
def change_status_compute_fail(mapping_dict):
    hash_name = 'compute'
    status = 1
    new_mapping_dict = {}
    for uid in mapping_dict:
        user_list = json.loads(mapping_dict[uid])  # fixed: values are JSON strings, not lists
        user_list[1] = '1'
        new_mapping_dict[uid] = json.dumps(user_list)
    r.hmset(hash_name, new_mapping_dict)  # fixed: was r.mset(hashname, ...), wrong command and undefined name
def scan_compute_redis(): hash_name = "compute" results = r.hgetall("compute") iter_user_list = [] mapping_dict = dict() for uid in results: user_list = json.loads(results[uid]) in_date = user_list[0] status = user_list[1] if status == "2": iter_user_list.append(uid) mapping_dict[uid] = json.dumps([in_date, "3"]) # mark status:3 computing if len(iter_user_list) % 100 == 0 and len(iter_user_list) != 0: # mark status from 2 to 3 as identify_compute to computing r.hmset("compute", mapping_dict) # acquire bulk user weibo data if WEIBO_API_INPUT_TYPE == 0: user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts = read_flow_text_sentiment( iter_user_list ) else: user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts = read_flow_text( iter_user_list ) # compute text attribute compute_status = test_cron_text_attribute_v2( user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts ) if compute_status == True: change_status_computed(mapping_dict) else: change_status_compute_fail(mapping_dict) iter_user_list = [] mapping_dict = {} if iter_user_list != [] and mapping_dict != {}: r.hmset("compute", mapping_dict) # acquire bulk user weibo date if WEIBO_API_INPUT_TYPE == 0: user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts = read_flow_text_sentiment( iter_user_list ) else: user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts = read_flow_text( iter_user_list ) # compute text attribute compute_status = test_cron_text_attribute_v2( user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts ) if compute_status == True: change_status_computed(mapping_dict) else: change_status_compute_fail(mapping_dict)
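# The 'compute' hash scanned above appears to map uid -> json.dumps([in_date, status]), with
# status '1' = failed/retry, '2' = submitted, '3' = computing, '4' = computed (labels inferred
# from the status-change helpers in this file set; treat them as an assumption). A small
# read helper sketch:
COMPUTE_STATUS_LABELS = {'1': 'failed/retry', '2': 'submitted', '3': 'computing', '4': 'computed'}

def get_compute_status(uid):
    item = r.hget('compute', uid)
    if not item:
        return None
    user_list = json.loads(item)
    in_date, status = user_list[0], user_list[1]
    return in_date, COMPUTE_STATUS_LABELS.get(status, status)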
def get_recommentation(admin_user):
    submit_recommentation_count = 0
    compute_count = 0
    search_date = ts2datetime(time.time() - DAY)
    submit_recomment_key = 'recomment_' + admin_user + '_' + search_date
    submit_user_recomment = set(R_RECOMMENTATION.hkeys(submit_recomment_key))
    all_compute_set = set(R_RECOMMENTATION.hkeys('compute'))
    submit_recommentation_count = len(submit_user_recomment)
    compute_count = len(submit_user_recomment & all_compute_set)
    return submit_recommentation_count, compute_count
def scan_compute_redis(): hash_name = 'compute' results = r.hgetall('compute') iter_user_list = [] mapping_dict = dict() for uid in results: user_list = json.loads(results[uid]) in_date = user_list[0] status = user_list[1] if status == '2': iter_user_list.append(uid) mapping_dict[uid] = json.dumps([in_date, '3']) # mark status:3 computing if len(iter_user_list) % 100 == 0 and len(iter_user_list) != 0: #mark status from 2 to 3 as identify_compute to computing r.hmset('compute', mapping_dict) #acquire bulk user weibo data if WEIBO_API_INPUT_TYPE == 0: user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts = read_flow_text_sentiment( iter_user_list) else: user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts = read_flow_text( iter_user_list) #compute text attribute compute_status = test_cron_text_attribute_v2( user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts) if compute_status == True: change_status_computed(mapping_dict) else: change_status_compute_fail(mapping_dict) iter_user_list = [] mapping_dict = {} if iter_user_list != [] and mapping_dict != {}: r.hmset('compute', mapping_dict) #acquire bulk user weibo date if WEIBO_API_INPUT_TYPE == 0: user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts = read_flow_text_sentiment( iter_user_list) else: user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts = read_flow_text( iter_user_list) #compute text attribute compute_status = test_cron_text_attribute_v2(user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts) if compute_status == True: change_status_computed(mapping_dict) else: change_status_compute_fail(mapping_dict)
def scan_compute_redis():
    hash_name = 'compute'
    results = r.hgetall('compute')
    iter_user_list = []
    mapping_dict = dict()
    for uid in results:
        user_list = json.loads(results[uid])
        in_date = user_list[0]
        status = user_list[1]
        if status == '2':
            iter_user_list.append(uid)
            mapping_dict[uid] = json.dumps([in_date, '3'])  # mark status 3: computing
        if len(iter_user_list) % 100 == 0 and len(iter_user_list) != 0:
            # mark status from 2 to 3: identify_compute to computing
            r.hmset('compute', mapping_dict)
            # acquire bulk user weibo data
            if WEIBO_API_INPUT_TYPE == 0:
                user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts = read_flow_text_sentiment(iter_user_list)
            else:
                user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts = read_flow_text(iter_user_list)
            # compute text attribute
            compute_status = test_cron_text_attribute_v2(user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts)
            if compute_status == True:
                change_status_computed(mapping_dict)
            else:
                change_status_compute_fail(mapping_dict)
            # deal with users who have no weibo to compute portrait attributes
            if len(user_keywords_dict) != len(iter_user_list):
                change_mapping_dict = dict()
                change_user_list = set(iter_user_list) - set(user_keywords_dict.keys())
                for change_user in change_user_list:
                    change_mapping_dict[change_user] = json.dumps([in_date, '2'])
                r.hmset('compute', change_mapping_dict)  # fixed: hash name was missing
            iter_user_list = []
            mapping_dict = {}
    if iter_user_list != [] and mapping_dict != {}:
        r.hmset('compute', mapping_dict)
        # acquire bulk user weibo data
        if WEIBO_API_INPUT_TYPE == 0:
            user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts = read_flow_text_sentiment(iter_user_list)
        else:
            user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts = read_flow_text(iter_user_list)
        # compute text attribute
        compute_status = test_cron_text_attribute_v2(user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts)
        if compute_status == True:
            change_status_computed(mapping_dict)
        else:
            change_status_compute_fail(mapping_dict)
        # deal with users who have no weibo to compute portrait attributes
        if len(user_keywords_dict) != len(iter_user_list):
            change_mapping_dict = dict()
            change_user_list = set(iter_user_list) - set(user_keywords_dict.keys())
            for change_user in change_user_list:
                change_mapping_dict[change_user] = json.dumps([in_date, '2'])
            r.hmset('compute', change_mapping_dict)  # fixed: hash name was missing
def save_results(save_type, user, recomment_results):
    save_mark = False
    #run_type
    if RUN_TYPE == 1:
        now_date = ts2datetime(time.time())
    else:
        now_date = ts2datetime(datetime2ts(RUN_TEST_TIME))
    recomment_hash_name = 'recomment_' + now_date + '_auto'
    #print 'save operation results'
    R_RECOMMENTATION.hset(recomment_hash_name, user, json.dumps(recomment_results))
    save_mark = True
    return save_mark
def read_uid_list():
    date = ts2datetime(time.time() - 24 * 3600)
    date = date.replace('-', '')
    sensitive_dict = r.hgetall('identify_in_sensitive_' + str(date))
    influence_dict = r.hgetall('identify_in_influence_' + str(date))
    uid_list = []
    for uid in sensitive_dict:
        if sensitive_dict[uid] != '3':
            uid_list.append(uid)
    for uid in influence_dict:
        if influence_dict[uid] != '3':
            uid_list.append(uid)
    return uid_list
def scan_compute_redis(): hash_name = 'compute' results = r.hgetall('compute') iter_user_list = [] mapping_dict = dict() #test count = 0 for uid in results: user_list = json.loads(results[uid]) in_date = user_list[0] status = user_list[1] if status == '2': #imme #test #count += 1 #if count >= 3: # break iter_user_list.append(uid) mapping_dict[uid] = json.dumps([in_date, '3']) # mark status:3 computing if len(iter_user_list) % 100 == 0 and len(iter_user_list) != 0: r.hmset('compute', mapping_dict) #acquire bulk user weibo data if WEIBO_API_INPUT_TYPE == 0: user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts,filter_keywords_dict = read_flow_text_sentiment(iter_user_list) else: user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts,filter_keywords_dict = read_flow_text(iter_user_list) #compute text attribute compute_status = test_cron_text_attribute_v2(user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts,filter_keywords_dict) if compute_status==True: change_status_computed(mapping_dict) else: change_status_compute_fail(mapping_dict) iter_user_list = [] mapping_dict = {} if iter_user_list != [] and mapping_dict != {}: r.hmset('compute', mapping_dict) #acquire bulk user weibo date if WEIBO_API_INPUT_TYPE == 0: user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts,filter_keywords_dict = read_flow_text_sentiment(iter_user_list) else: user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts,filter_keywords_dict = read_flow_text(iter_user_list) #compute text attribute compute_status = test_cron_text_attribute_v2(user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts,filter_keywords_dict) if compute_status==True: change_status_computed(mapping_dict) else: change_status_compute_fail(mapping_dict)
def save_results(save_type, recomment_results):
    save_mark = False
    #run_type
    if RUN_TYPE == 1:
        now_date = ts2datetime(time.time())
    else:
        now_date = ts2datetime(datetime2ts(RUN_TEST_TIME) - DAY)
    recomment_hash_name = 'recomment_' + now_date + '_auto'
    if save_type == 'hotspot':
        #print 'save hotspot results'
        R_RECOMMENTATION.hset(recomment_hash_name, 'auto', json.dumps(recomment_results))
        save_mark = True
    elif save_type == 'operation':
        #print 'save operation results'
        R_RECOMMENTATION.hmset(recomment_hash_name, recomment_results)
        save_mark = True
    return save_mark
def main():
    now_ts = time.time()
    delete_ts = datetime2ts(ts2datetime(now_ts - EXPIRE_TIME))  # timestamp to be deleted
    delete_date = ts2datetime(now_ts - EXPIRE_TIME)
    del_day = ts2datetime(now_ts - MONTH_TIME)
    index_name = flow_text_index_name_pre + del_day
    exist_es = es_flow_text.indices.exists(index=index_name)
    if exist_es:
        es_flow_text.indices.delete(index=index_name)
    index_bci = "bci_" + del_day.replace('-', '')
    exist_bci = ES_CLUSTER_FLOW1.indices.exists(index=index_bci)
    if exist_bci:
        ES_CLUSTER_FLOW1.indices.delete(index=index_bci)
    # delete @
    redis_cluster.delete("at_" + str(delete_ts))
    redis_cluster.delete("sensitive_at_" + str(delete_ts))
    # delete ip
    redis_ip.delete('ip_' + str(delete_ts))
    if WORK_TYPE == 0:
        exist_ip = es_cluster.indices.exists(index="ip_" + delete_date)
        if exist_ip:
            es_cluster.indices.delete(index="ip_" + delete_date)
    redis_ip.delete('sensitive_ip_' + str(delete_ts))
    if WORK_TYPE == 0:
        exist_ip = es_cluster.indices.exists(index="sensitive_ip_" + delete_date)
        if exist_ip:
            es_cluster.indices.delete(index="sensitive_ip_" + delete_date)
    # delete activity
    redis_activity.delete('activity_' + str(delete_ts))
    if WORK_TYPE == 0:
        exist_activity = es_cluster.indices.exists(index="activity_" + delete_date)
        if exist_activity:
            es_cluster.indices.delete(index="activity_" + delete_date)
    redis_activity.delete('sensitive_activity_' + str(delete_ts))
    if WORK_TYPE == 0:
        exist_activity = es_cluster.indices.exists(index="sensitive_activity_" + delete_date)
        if exist_activity:
            es_cluster.indices.delete(index="sensitive_activity_" + delete_date)
    # delete hashtag
    redis_cluster.delete('hashtag_' + str(delete_ts))
    redis_cluster.delete('sensitive_hashtag_' + str(delete_ts))
    # delete sensitive words
    redis_cluster.delete('sensitive_' + str(delete_ts))
    # delete recommendation
    r.delete('recomment_' + str(delete_date) + "_influence")
    r.delete('recomment_' + str(delete_date) + "_sensitive")
    r.delete("identify_in_sensitive_" + str(delete_date))
    r.delete("identify_in_influence_" + str(delete_date))  # fixed: stray extra closing parenthesis
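# The deletion routine above repeats the same exists/delete pattern for several ES indices;
# a small helper (sketch, assuming an elasticsearch-py client object) would remove the
# duplication without changing behaviour.
def delete_index_if_exists(es_client, index_name):
    if es_client.indices.exists(index=index_name):
        es_client.indices.delete(index=index_name)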
def get_recomment_history(admin_user, now_date):
    results = set()
    now_ts = datetime2ts(now_date)
    for i in range(RECOMMEND_IN_AUTO_DATE, 0, -1):
        iter_date = ts2datetime(now_ts - i * DAY)
        submit_user_recomment = 'recomment_' + admin_user + '_' + str(iter_date)  # fixed typo: was 'recoment_'
        recomment_user_list = set(R_RECOMMENTATION.hkeys(submit_user_recomment))
        results = results | recomment_user_list
    return results
def update_recommentation_compute(admin_user): status = False #step1: update lastest 6 day compute count end_ts = datetime2ts(ts2datetime(time.time() - DAY)) all_compute_set = set(R_RECOMMENTATION.hkeys('compute')) bulk_action = [] for i in range(1, 6): iter_ts = end_ts - i* DAY iter_date = ts2datetime(iter_ts) submit_recomment_key = 'recomment_' + admin_user + '_' + iter_date submit_recomment_set = set(R_RECOMMENTATION.hkeys(submit_recomment_key)) compute_count = len(submit_recomment_set & all_compute_set) user_results = {'compute_count': compute_count} action = {'update': {'_id': admin_user + '_' + str(iter_ts)}} bulk_action.extend([action, {'doc': user_results}]) #step2: update bulk action #print 'bulk_action:', bulk_action es_operation.bulk(bulk_action, index=operation_index_name, doc_type=operation_index_type) status = True return status
def recommentation_in(input_ts, recomment_type, submit_user):
    date = ts2datetime(input_ts)
    recomment_results = []
    # read from redis
    results = []
    hash_name = 'recomment_' + str(date) + "_" + recomment_type
    identify_in_hashname = "identify_in_" + str(date)
    submit_user_recomment = "recomment_" + submit_user + "_" + str(date)  # users this admin has already submitted
    results = r.hgetall(hash_name)
    if not results:
        return []
    # search from user_profile to enrich the displayed information
    recommend_list = set(r.hkeys(hash_name))
    identify_in_list = set(r.hkeys("compute"))
    submit_user_recomment = set(r.hkeys(submit_user_recomment))
    recomment_results = list(recommend_list - identify_in_list)
    recomment_results = list(set(recomment_results) - submit_user_recomment)
    if recomment_results:
        results = get_user_detail(date, recomment_results, 'show_in', recomment_type)
    else:
        results = []
    return results
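# Usage sketch (the admin user name is hypothetical): list yesterday's influence-type
# candidates that this admin has not yet submitted and that are not already queued in 'compute'.
yesterday_ts = time.time() - DAY
pending_candidates = recommentation_in(yesterday_ts, 'influence', 'admin')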
def scan_compute_redis():
    hash_name = 'compute'
    results = r.hgetall('compute')
    iter_user_list = []  # renamed: 'user_list' was reused for both the accumulator and the parsed value
    mapping_dict = dict()
    for uid in results:
        user_list = json.loads(results[uid])
        print 'user_list:', user_list
        in_date = user_list[0]
        status = user_list[1]
        if status == '1':
            iter_user_list.append(uid)  # fixed: was appending to the parsed value
            mapping_dict[uid] = json.dumps([in_date, '3'])  # mark status 3: computing
    print 'mapping_dict:', mapping_dict
    r.hmset('compute', mapping_dict)
    # acquire bulk user weibo data
    #user_weibo_dict = read_user_weibo(iter_user_list)
    # compute text attribute
    #compute_status = compute2in(iter_user_list, user_weibo_dict, status='insert')
    compute_status = False
    if compute_status == True:
        change_status_computed(mapping_dict)
def change_status_computed(mapping_dict):
    hash_name = 'compute'
    status = 4
    new_mapping_dict = {}
    for uid in mapping_dict:
        user_list = json.loads(mapping_dict[uid])
        user_list[1] = '4'
        in_date = user_list[0]
        new_mapping_dict[uid] = json.dumps(user_list)
        # revise identify_in_date
        influence_hashname = 'identify_in_influence_' + str(in_date)
        sensitive_hashname = 'identify_in_sensitive_' + str(in_date)
        manual_hashname = "identify_in_manual_" + str(in_date)
        tmp = r.hget(influence_hashname, uid)
        tmp1 = r.hget(sensitive_hashname, uid)
        if tmp:
            r.hset(influence_hashname, uid, '4')
        elif tmp1:
            r.hset(sensitive_hashname, uid, '4')
        else:
            r.hset(manual_hashname, uid, '4')
    r.hmset(hash_name, new_mapping_dict)
def main():
    now_ts = time.time()
    delete_ts = datetime2ts(ts2datetime(now_ts - EXPIRE_TIME))  # timestamp to be deleted
    delete_date = ts2datetime(now_ts - EXPIRE_TIME)
    # delete @
    r_cluster.delete("at_" + str(delete_ts))
    # delete ip
    r_cluster.delete('new_ip_' + str(delete_ts))
    # delete activity
    r_cluster.delete('activity_' + str(delete_ts))
    # delete hashtag
    r_cluster.delete('hashtag_' + str(delete_ts))
    # delete sensitive words
    r_cluster.delete('sensitive_' + str(delete_ts))
    # delete recommendation
    r.delete('recomment_' + str(delete_date))
def change_status_computed(mapping_dict):
    hash_name = 'compute'
    status = 4
    new_mapping_dict = {}
    for uid in mapping_dict:
        user_list = json.loads(mapping_dict[uid])
        user_list[1] = '4'
        in_date = user_list[0]
        new_mapping_dict[uid] = json.dumps(user_list)
        # revise identify_in_date
        influence_hashname = 'identify_in_influence_' + str(in_date)
        sensitive_hashname = 'identify_in_sensitive_' + str(in_date)
        manual_hashname = "identify_in_manual_" + str(in_date)
        tmp = r.hget(influence_hashname, uid)
        tmp1 = r.hget(sensitive_hashname, uid)
        if tmp:
            r.hset(influence_hashname, uid, '4')
        elif tmp1:
            r.hset(sensitive_hashname, uid, '4')
        else:
            r.hset(manual_hashname, uid, '4')
    r.hmset(hash_name, new_mapping_dict)
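# change_status_computed() and change_status_compute_fail() both look the uid up in the daily
# identify_in_* hashes and update whichever one contains it; a sketch of that shared step,
# using only redis hget/hset (precedence copied from the functions above).
def update_identify_in_status(uid, in_date, status):
    influence_hashname = 'identify_in_influence_' + str(in_date)
    sensitive_hashname = 'identify_in_sensitive_' + str(in_date)
    manual_hashname = 'identify_in_manual_' + str(in_date)
    if r.hget(influence_hashname, uid):
        r.hset(influence_hashname, uid, status)
    elif r.hget(sensitive_hashname, uid):
        r.hset(sensitive_hashname, uid, status)
    else:
        r.hset(manual_hashname, uid, status)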
def get_operate_information():
    result = dict()
    now_ts = time.time()
    date = ts2datetime(now_ts - 24 * 3600)
    # test
    date = '2013-09-07'
    delete_date = ''.join(date.split('-'))
    # test
    #delete_date = '20150727'
    result['in_count'] = len(r_recomment.hkeys('recomment_' + str(date)))
    out_count_list = r_recomment.hget('recommend_delete_list', delete_date)
    #print 'out_count_list:', out_count_list
    if out_count_list:
        result['out_count'] = len(json.loads(out_count_list))
    else:
        result['out_count'] = 0
    compute_list = r_recomment.hkeys('compute')
    '''
    if compute_list:
        result['compute'] = len(compute_list)
    '''
    #print 'operate compute:', result
    return result
def update_recommentation_compute(admin_user): status = False #step1: update lastest 6 day compute count end_ts = datetime2ts(ts2datetime(time.time() - DAY)) all_compute_set = set(R_RECOMMENTATION.hkeys('compute')) bulk_action = [] for i in range(1, 6): iter_ts = end_ts - i * DAY iter_date = ts2datetime(iter_ts) submit_recomment_key = 'recomment_' + admin_user + '_' + iter_date submit_recomment_set = set( R_RECOMMENTATION.hkeys(submit_recomment_key)) compute_count = len(submit_recomment_set & all_compute_set) user_results = {'compute_count': compute_count} action = {'update': {'_id': admin_user + '_' + str(iter_ts)}} bulk_action.extend([action, {'doc': user_results}]) #step2: update bulk action #print 'bulk_action:', bulk_action es_operation.bulk(bulk_action, index=operation_index_name, doc_type=operation_index_type) status = True return status
def filter_rules(candidate_results):
    results = []
    # rule1: activity count
    filter_result1 = filter_activity(candidate_results)
    # rule2: ip count
    filter_result2 = filter_ip(filter_result1)
    # rule3: retweet count & beretweeted count
    filter_result3 = filter_retweet_count(filter_result2)
    # rule4: mention count
    results = filter_mention(filter_result3)
    # rule5: exclude users already queued in the 'compute' hash
    compute_uid_set = r.hkeys("compute")
    results = set(results) - set(compute_uid_set)
    return results
def scan_compute_redis_v2(): task_type = 'user' bulk_action = [] count = 0 iter_user_list = [] verified_mark_dict = dict() relation_mark_dict = dict() submit_user_dict = dict() submit_ts_dict = dict() while True: r_user_item = r.rpop(r_user_update_hash_name) #print 'r_user_item:', r_user_item if r_user_item: #print 'r_user_item:', r_user_item r_user_item = json.loads(r_user_item) uid = r_user_item[0] relation_mark = r_user_item[1] iter_user_list.append(uid) relation_mark_dict[uid] = relation_mark count += 1 else: break if count % 100 == 0 and count != 0: if WEIBO_API_INPUT_TYPE == 0: user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts = read_flow_text_sentiment( iter_user_list) else: user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts = read_flow_text( iter_user_list) compute_status = test_cron_text_attribute_v2( user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts, relation_mark_dict, task_type, submit_user_dict, submit_ts_dict) iter_user_list = [] relation_mark_dict = dict() if iter_user_list != []: if WEIBO_API_INPUT_TYPE == 0: user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts = read_flow_text_sentiment( iter_user_list) else: user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts = read_flow_text( iter_user_list) compute_status = test_cron_text_attribute_v2( user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts, relation_mark_dict, task_type, submit_user_dict, submit_ts_dict)
def get_operate_information():
    result = dict()
    #run_type
    if RUN_TYPE == 1:
        now_ts = time.time()
    else:
        now_ts = datetime2ts(RUN_TEST_TIME)
    date = ts2datetime(now_ts - DAY)
    delete_date = ''.join(date.split('-'))
    result['in_count'] = len(r_recomment.hkeys('recomment_' + str(date)))
    out_count_list = r_recomment.hget('recommend_delete_list', delete_date)
    if out_count_list:
        result['out_count'] = len(json.loads(out_count_list))
    else:
        result['out_count'] = 0
    compute_list = r_recomment.hkeys('compute')
    if compute_list:
        result['compute'] = len(compute_list)
    else:
        result['compute'] = 0
    return result
def change_status_compute_fail(mapping_dict):
    hash_name = 'compute'
    status = 1
    new_mapping_dict = {}
    for uid in mapping_dict:
        user_list = json.loads(mapping_dict[uid])
        user_list[1] = '1'
        new_mapping_dict[uid] = json.dumps(user_list)
        in_date = user_list[0]
        # revise identify_in_date
        influence_hashname = 'identify_in_influence_' + str(in_date)
        sensitive_hashname = 'identify_in_sensitive_' + str(in_date)
        tmp = r.hget(influence_hashname, uid)
        if tmp:
            r.hset(influence_hashname, uid, '1')
        else:
            r.hset(sensitive_hashname, uid, '1')
    r.hmset(hash_name, new_mapping_dict)
def createWordTree():
    awords = []
    sensitive_words = r.hkeys('sensitive_words')
    #for b in open('./../../sensitive_words.txt', 'rb'):
    #    awords.append(b.strip())
    awords = sensitive_words
    for word in awords:
        temp = wordTree
        for a in range(0, len(word)):
            index = ord(word[a])
            if a < (len(word) - 1):
                if temp[index] == None:
                    node = [[None for x in range(256)], 0]
                    temp[index] = node
                elif temp[index] == 1:
                    node = [[None for x in range(256)], 1]
                    temp[index] = node
                temp = temp[index][0]
            else:
                temp[index] = 1
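# createWordTree() builds a byte-level trie in the global wordTree (assumed to be initialised
# elsewhere as [None for x in range(256)]); a minimal matching sketch that checks whether a
# byte string contains any sensitive word stored in the tree.
def contains_sensitive_word(text):
    for start in range(len(text)):
        temp = wordTree
        for a in range(start, len(text)):
            node = temp[ord(text[a])]
            if node is None:
                break
            if node == 1 or node[1] == 1:
                return True  # a sensitive word ends at position a
            temp = node[0]   # descend into the children array
    return False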
def main(): #run_type if RUN_TYPE == 1: now_ts = time.time() else: now_ts = datetime2ts(RUN_TEST_TIME) date = ts2datetime(now_ts - DAY) print date # auto recommendation: step 1:4 #step1: read from top es_daily_rank top_user_set, user_dict = search_from_es(date) #step2: filter black_uid black_user_set = read_black_user() subtract_user_set = top_user_set - black_user_set #step3: filter users have been in subtract_user_set = list(subtract_user_set) candidate_results = filter_in(subtract_user_set) #step4: filter rules about ip count& reposts/bereposts count&activity count results = filter_rules(candidate_results) new_date = ts2datetime(now_ts) hashname_influence = "recomment_" + new_date + "_influence" if results: for uid in results: r.hset(hashname_influence, uid, "0") #step5: get sensitive user print date,'date' sensitive_user = list(get_sensitive_user(date)) hashname_sensitive = "recomment_" + new_date + "_sensitive" if sensitive_user: for uid in sensitive_user: print uid, hashname_sensitive r.hset(hashname_sensitive, uid, "0") results.extend(sensitive_user) results = set(results) # step6: write to recommentation csv/redis ++for super admin hashname_submit = "submit_recomment_" + new_date if results: for uid in results: r.hset(hashname_submit, uid, json.dumps({"system":1, "operation":"system"}))
def main(): #run_type if RUN_TYPE == 1: now_ts = time.time() else: now_ts = datetime2ts(RUN_TEST_TIME) date = ts2datetime(now_ts - DAY) # auto recommendation: step 1:4 #step1: read from top es_daily_rank top_user_set, user_dict = search_from_es(date) #step2: filter black_uid black_user_set = read_black_user() subtract_user_set = top_user_set - black_user_set #step3: filter users have been in subtract_user_set = list(subtract_user_set) candidate_results = filter_in(subtract_user_set) #step4: filter rules about ip count& reposts/bereposts count&activity count results = filter_rules(candidate_results) new_date = ts2datetime(now_ts) hashname_influence = "recomment_" + new_date + "_influence" if results: for uid in results: r.hset(hashname_influence, uid, "0") #step5: get sensitive user sensitive_user = list(get_sensitive_user(date)) hashname_sensitive = "recomment_" + new_date + "_sensitive" if sensitive_user: for uid in sensitive_user: r.hset(hashname_sensitive, uid, "0") results.extend(sensitive_user) results = set(results) #step6: write to recommentation csv/redis hashname_submit = "submit_recomment_" + new_date if results: for uid in results: r.hset(hashname_submit, uid, json.dumps({"system":1, "operation":"system"}))
def save_result(results):
    hash_name = 'overview'
    for item in results:
        r_recomment.hset(hash_name, item, results[item])
    return True
sensitive_words = sensitive_dict.keys() else: sensitive_words = [] if sensitive_history_dict.get('fields',0): #print sensitive_history_dict['fields'][sensitive_string][0] #print top_sensitive sensitive_value = math.log(sensitive_history_dict['fields'][sensitive_string][0]/float(top_sensitive)*9+1, 10)*100 #print "sensitive_value", sensitive_value else: sensitive_value = 0 results.append([uid, uname, location, fansnum, statusnum, influence, sensitive_words, sensitive_value]) else: results.append([uid, uname, location, fansnum, statusnum, influence]) if auth: hashname_submit = "submit_recomment_" + date tmp_data = json.loads(r.hget(hashname_submit, uid)) recommend_list = (tmp_data['operation']).split('&') admin_list = [] admin_list.append(tmp_data['system']) admin_list.append(list(set(recommend_list))) admin_list.append(len(recommend_list)) return results def get_evaluate_max(index_name): max_result = {} index_type = 'bci' evaluate_index = ['user_index'] for evaluate in evaluate_index: query_body = { 'query':{ 'match_all':{}
# every 30 minutes re-execute this program
import sys
import time
import json
import redis
from elasticsearch import Elasticsearch
from text_attribute import compute_attribute

reload(sys)
sys.path.append('./../../')
from global_utils import R_RECOMMENTATION as r
from global_utils import es_sensitive_user_text as es_text
from time_utils import datetime2ts, ts2datetime

date = ts2datetime(time.time()).replace('-', '')
temp = r.hget('compute_now', date)
if temp:
    now_list = json.loads(temp)
    uid_list = []
    count = 0
    for item in now_list:
        uid_list.append(item[0])
    user_weibo_dict = dict()
    # extract user weibo text
    compute_attribute(user_weibo_dict)
    for i in range(len(now_list)):  # fixed: range() needs an integer, not the list itself
        uid = now_list[i][0]
        source = now_list[i][1]
        if source == '1':
            r.hset('identify_in_sensitive_' + str(date), uid, '3')  # finish compute
        else:
def scan_compute_redis(): task_mark = 'user' hash_name = r_user_hash_name results = r.hgetall(hash_name) iter_user_list = [] mapping_dict = dict() verify_mark_dict = dict() relation_mark_dict = dict() submit_user_dict = dict() submit_ts_dict = dict() count = 0 for uid in results: user_list = json.loads(results[uid]) in_date = user_list[0] status = user_list[1] verify_mark = user_list[2] relation_list = user_list[3] submit_user = user_list[4] submit_ts = datetime2ts(in_date) verify_mark_dict[uid] = verify_mark relation_mark_dict[uid] = relation_list submit_user_dict[uid] = submit_user submit_ts_dict[uid] = submit_ts if status == '2': #imme #test count += 1 iter_user_list.append(uid) mapping_dict[uid] = json.dumps( [in_date, '3', verify_mark, relation_list, submit_user]) # mark status:3 computing if len(iter_user_list) % 100 == 0 and len(iter_user_list) != 0: r.hmset(r_user_hash_name, mapping_dict) #acquire bulk user weibo data if WEIBO_API_INPUT_TYPE == 0: user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts = read_flow_text_sentiment( iter_user_list) else: user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts = read_flow_text( iter_user_list) #compute text attribute compute_status = test_cron_text_attribute_v2( user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts, relation_mark_dict, task_mark, submit_user_dict, submit_ts_dict) if compute_status == True: change_status_computed(mapping_dict) else: change_status_compute_fail(mapping_dict) #when uid user no weibo at latest week to change compute status to 1 if len(user_keywords_dict) != len(iter_user_list): change_mapping_dict = dict() change_user_list = set(iter_user_list) - set( user_keywords_dict.keys()) for change_user in change_user_list: change_mapping_dict[change_user] = json.dumps([ in_date, '2', verify_mark_dict[change_user], relation_mark_dict[change_user], submit_user_dict[change_user], submit_ts_dict[change_user] ]) r.hmset(r_user_hash_name, change_mapping_dict) iter_user_list = [] mapping_dict = {} verify_mark_dict = dict() relation_mark_dict = dict() submit_user_dict = dict() submit_ts_dict = dict() if iter_user_list != [] and mapping_dict != {}: r.hmset(r_user_hash_name, mapping_dict) #acquire bulk user weibo date print 'iter_user_list:', len(iter_user_list) if WEIBO_API_INPUT_TYPE == 0: user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts = read_flow_text_sentiment( iter_user_list) else: user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts = read_flow_text( iter_user_list) #compute text attribute print 'user_weibo_dict:', len(user_weibo_dict) compute_status = test_cron_text_attribute_v2( user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts, relation_mark_dict, task_mark, submit_user_dict, submit_ts_dict) if compute_status == True: change_status_computed(mapping_dict) else: change_status_compute_fail(mapping_dict) #when uid user no weibo at latest week to change compute status to 1 if len(user_keywords_dict) != len(iter_user_list): change_mapping_dict = dict() change_user_list = set(iter_user_list) - set( user_keywords_dict.keys()) for change_user in change_user_list: change_mapping_dict[change_user] = json.dumps([ in_date, '2', verify_mark_dict[change_user], relation_mark_dict[change_user], submit_user_dict[change_user], submit_ts_dict[change_user] ]) r.hmset(r_user_hash_name, change_mapping_dict)
def scan_compute_redis(): hash_name = 'compute' results = r.hgetall('compute') iter_user_list = [] mapping_dict = dict() #test count = 0 for uid in results: user_list = json.loads(results[uid]) in_date = user_list[0] status = user_list[1] if status == '1': #imme #test count += 1 iter_user_list.append(uid) mapping_dict[uid] = json.dumps([in_date, '3']) # mark status:3 computing if len(iter_user_list) % 100 == 0 and len(iter_user_list) != 0: r.hmset('compute', mapping_dict) #acquire bulk user weibo data if WEIBO_API_INPUT_TYPE == 0: user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts = read_flow_text_sentiment(iter_user_list) else: user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts = read_flow_text(iter_user_list) #compute text attribute compute_status = test_cron_text_attribute_v2(user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts) if compute_status==True: change_status_computed(mapping_dict) else: change_status_compute_fail(mapping_dict) #when uid user no weibo at latest week to change compute status to 1 if len(user_keywords_dict) != len(iter_user_list): change_mapping_dict = dict() change_user_list = set(iter_user_list) - set(user_keywords_dict.keys()) for change_user in change_user_list: change_mapping_dict[change_user] = json.dumps([in_date, '1']) r.hmset('compute', change_mapping_dict) iter_user_list = [] mapping_dict = {} if iter_user_list != [] and mapping_dict != {}: r.hmset('compute', mapping_dict) #acquire bulk user weibo date print 'iter_user_list:', len(iter_user_list) if WEIBO_API_INPUT_TYPE == 0: user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts = read_flow_text_sentiment(iter_user_list) else: user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts = read_flow_text(iter_user_list) #compute text attribute print 'user_weibo_dict:', len(user_weibo_dict) compute_status = test_cron_text_attribute_v2(user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts) if compute_status==True: change_status_computed(mapping_dict) else: change_status_compute_fail(mapping_dict) #when uid user no weibo at latest week to change compute status to 1 if len(user_keywords_dict) != len(iter_user_list): change_mapping_dict = dict() change_user_list = set(iter_user_list) - set(user_keywords_dict.keys()) for change_user in change_user_list: change_mapping_dict[change_user] = json.dumps([in_date, '1']) r.hmset('compute', change_mapping_dict)
} result = es_cluster.search(index=index_name, doc_type="bci", body=query_body)["hits"]["hits"] sensitive_uid = [] for item in result: sensitive_uid.append(item['_source']['uid']) return sensitive_uid if __name__ == "__main__": now_date = ts2datetime(time.time()).replace('-', '') former_date = ts2datetime(time.time() - 7 * 24 * 3600).replace('-', '') r_recommend.hdel('recommend_sensitive', former_date) # delete 7 days ago recommentation uid_list r_recommend.hdel('recommend_influence', former_date) # delete 7 days ago recommentation uid_list now_date = '20130901' # test sensitive_weibo_uid = search_sensitive_weibo( now_date) # sensitive words uid list, direct recommend in top_influence_uid = search_top_k(now_date, 10000) # top influence uid list, filter # step 1: no sensitive user in top influence revise_influence_uid_list = set(top_influence_uid) - set( sensitive_weibo_uid) black_uid_list = read_black_user_list() revise_influence_uid_list = set(revise_influence_uid_list) - set( black_uid_list) print 'filter black list: ', len(revise_influence_uid_list)
"sort": [{"user_index": {"order": "desc"}}] } result = es_cluster.search(index=index_name,doc_type="bci", body=query_body)["hits"]["hits"] sensitive_uid = [] for item in result: sensitive_uid.append(item['_source']['uid']) return sensitive_uid if __name__ == "__main__": now_date = ts2datetime(time.time()).replace('-','') former_date = ts2datetime(time.time()-7*24*3600).replace('-','') r_recommend.hdel('recommend_sensitive', former_date) # delete 7 days ago recommentation uid_list r_recommend.hdel('recommend_influence', former_date) # delete 7 days ago recommentation uid_list now_date = '20130901' # test sensitive_weibo_uid = search_sensitive_weibo(now_date) # sensitive words uid list, direct recommend in top_influence_uid = search_top_k(now_date, 10000) # top influence uid list, filter # step 1: no sensitive user in top influence revise_influence_uid_list = set(top_influence_uid) - set(sensitive_weibo_uid) black_uid_list = read_black_user_list() revise_influence_uid_list = set(revise_influence_uid_list) - set(black_uid_list) print 'filter black list: ', len(revise_influence_uid_list) #total = set(sensitive_weibo_uid) | set(top_influence_uid) # step 2: no recommending sensitive_uid_recommending_filter = filter_recommend(sensitive_weibo_uid) top_influence_recommending_filter = filter_recommend(revise_influence_uid_list) # step 3: no one in portrait
def save_recommentation2redis(date, user_set):
    hash_name = 'recomment_' + str(date)
    status = 0
    for uid in user_set:
        r.hset(hash_name, uid, status)
    return True
def save_recommentation2redis(date, user_set):
    hash_name = 'recomment_' + str(date)
    status = 0
    for uid in user_set:
        r.hset(hash_name, uid, status)
    return True
def get_attr(date):
    results = dict()
    total_number = es.count(index="sensitive_user_portrait", doc_type="user")['count']
    results['total_number'] = total_number
    max_result = get_evaluate_max()
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "term": {"sensitive": 0}
                }
            }
        }
    }
    influence_number = es.count(index="sensitive_user_portrait", doc_type="user", body=query_body)['count']
    results['sensitive_number'] = total_number - influence_number
    results['influence_number'] = influence_number
    # statistics of political tendency
    query_body = query_body_module('politics')
    politic_array = es.search(index='sensitive_user_portrait', doc_type='user', body=query_body)['aggregations']['all_interests']['buckets']
    politic_dict = dict()
    for item in politic_array:
        politic_dict[item['key']] = item['doc_count']
    results['politics'] = politic_dict
    # number of users recommended for inclusion
    recommend_in_sensitive = r.hlen("recomment_" + date + "_sensitive")  # fixed: key was missing the '_' separator
    recommend_in_influence = r.hlen("recomment_" + date + "_influence")
    results['recommend_in'] = recommend_in_influence + recommend_in_sensitive
    # group analysis tasks
    results['monitor_number'] = [4, 83]  # test
    query_body = {
        "query": {
            "bool": {
                "must": [
                    {"term": {'task_type': "detect"}},
                    {"term": {"state": 0}}
                ]
            }
        }
    }
    group_detect_number = es.count(index=group_index_name, doc_type=group_index_type, body=query_body)["count"]
    query_body = {
        "query": {
            "bool": {
                "must": [
                    {"term": {'task_type': "analysis"}},
                    {"term": {"state": 0}}
                ]
            }
        }
    }
    group_analysis_number = es.count(index=group_index_name, doc_type=group_index_type, body=query_body)["count"]
    results["group_detect_number"] = group_detect_number
    results["group_analysis_number"] = group_analysis_number
    # sensitive words
    query_body = query_body_module('sensitive_words_string')
    sw_list = es.search(index='sensitive_user_portrait', doc_type='user', body=query_body)['aggregations']['all_interests']['buckets']
    sensitive_words = []
    for item in sw_list:
        sensitive_words.append([item['key'], item['doc_count']])
    results['sensitive_words'] = sensitive_words
    query_body = query_body_module('keywords_string')
    kw_list = es.search(index='sensitive_user_portrait', doc_type='user', body=query_body)['aggregations']['all_interests']['buckets']
    keywords_string = []
    for item in kw_list:
        keywords_string.append([item['key'], item['doc_count']])
    results['keywords_string'] = keywords_string
    query_body = query_body_module('sensitive_hashtag_string')
    sh_list = es.search(index='sensitive_user_portrait', doc_type='user', body=query_body)['aggregations']['all_interests']['buckets']
    sensitive_hashtag = []
    for item in sh_list:
        sensitive_hashtag.append([item['key'], item['doc_count']])
    results['sensitive_hashtag'] = sensitive_hashtag
    query_body = query_body_module('sensitive_activity_geo_aggs')
    sg_list = es.search(index='sensitive_user_portrait', doc_type='user', body=query_body)['aggregations']['all_interests']['buckets']
    sensitive_geo = []
    for item in sg_list:
        sensitive_geo.append([item['key'], item['doc_count']])
    results['sensitive_geo'] = sensitive_geo
    '''
    query_body = query_body_module('domain_string')
    sd_list = es.search(index='sensitive_user_portrait', doc_type='user', body=query_body)['aggregations']['all_interests']['buckets']
    domain = []
    for item in sd_list:
        domain.append([item['key'], item['doc_count']])
    results['domain'] = domain
    '''
    # tendency distribution
    # domain and topic
    rank_results = get_top_user()
    results.update(rank_results)
    # rank
    results['importance'] = search_in_portrait('importance', max_result)
    results['sensitive'] = search_in_portrait('sensitive', max_result)
    results['influence'] = search_in_portrait('influence', max_result)
    results['activeness'] = search_in_portrait('activeness', max_result)
    # retweet and comment counts of sensitive weibo
    mid_list = get_top_mid()
    sensitive_hot_retweet = sort_retweet_sensitive_weibo(mid_list)
    sensitive_hot_comment = sort_comment_sensitive_weibo(mid_list)
    sensitive_weibo_text = get_weibo_detail(mid_list)
    results['sensitive_hot_retweet'] = sensitive_hot_retweet
    results['sensitive_hot_comment'] = sensitive_hot_comment
    results['sensitive_weibo_text'] = sensitive_weibo_text
    r.set('overview', json.dumps(results))
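# query_body_module() is referenced above but not shown; since every caller reads
# ['aggregations']['all_interests']['buckets'], it presumably builds a terms aggregation on
# the requested field. A hedged sketch of what such a body could look like:
def query_body_module(attribute_name):
    return {
        'query': {'match_all': {}},
        'size': 0,
        'aggs': {
            'all_interests': {
                'terms': {'field': attribute_name, 'size': 100}
            }
        }
    }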
def save_recommentation2redis(date, user_set):
    hash_name = 'recommend'
    date = date.replace('-', '')
    if user_set:
        r.hset(hash_name, date, json.dumps(list(user_set)))
    return 1
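# Read-back sketch for the 'recommend' hash written above: each field is a yyyymmdd date whose
# value is a JSON-encoded uid list (this mirrors filter_recommend() earlier in this section).
def read_recommend_by_date(date):
    raw = r.hget('recommend', date.replace('-', ''))
    return json.loads(raw) if raw else []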
def get_attr(date): results = dict() total_number = es.count(index="sensitive_user_portrait", doc_type="user")['count'] results['total_number'] = total_number max_result = get_evaluate_max() query_body={ "query":{ "filtered":{ "filter":{ "term":{ "sensitive": 0 } } } } } influence_number = es.count(index="sensitive_user_portrait", doc_type="user", body=query_body)['count'] results['sensitive_number'] = total_number - influence_number results['influence_number'] = influence_number # 政治倾向性统计 query_body = query_body_module('politics') politic_array = es.search(index='sensitive_user_portrait', doc_type='user', body=query_body)['aggregations']['all_interests']['buckets'] politic_dict = dict() for item in politic_array: politic_dict[item['key']] = item['doc_count'] results['politics'] = politic_dict # 入库推荐人数 recommend_in_sensitive = 0 recommend_in_sensitive = r.hlen("recomment_" + date +'sensitive') recommend_in_influence = 0 recommend_in_influence = r.hlen("recomment_" + date + "_influence") results['recommend_in'] = recommend_in_influence + recommend_in_sensitive # 群体分析任务 results['monitor_number'] = [4, 83] # test query_body = { "query":{ "bool":{ "must":[ {"term":{'task_type':"detect"}}, {"term":{"state":0}} ] } } } group_detect_number = es.count(index=group_index_name, doc_type=group_index_type, body=query_body)["count"] query_body = { "query":{ "bool":{ "must":[ {"term":{'task_type':"analysis"}}, {"term":{"state":0}} ] } } } group_analysis_number = es.count(index=group_index_name, doc_type=group_index_type, body=query_body)["count"] results["group_detect_number"] = group_detect_number results["group_analysis_number"] = group_analysis_number # 敏感词 query_body = query_body_module('sensitive_words_string') sw_list = es.search(index='sensitive_user_portrait', doc_type='user', body=query_body)['aggregations']['all_interests']['buckets'] sensitive_words = [] for item in sw_list: temp = [] temp.append(item['key']) temp.append(item['doc_count']) sensitive_words.append(temp) results['sensitive_words'] = sensitive_words query_body = query_body_module('keywords_string') sg_list = es.search(index='sensitive_user_portrait', doc_type='user', body=query_body)['aggregations']['all_interests']['buckets'] sensitive_geo = [] for item in sg_list: temp = [] temp.append(item['key']) temp.append(item['doc_count']) sensitive_geo.append(temp) results['keywords_string'] = sensitive_geo query_body = query_body_module('sensitive_hashtag_string') sh_list = es.search(index='sensitive_user_portrait', doc_type='user', body=query_body)['aggregations']['all_interests']['buckets'] sensitive_hashtag = [] for item in sh_list: temp = [] temp.append(item['key']) temp.append(item['doc_count']) sensitive_hashtag.append(temp) results['sensitive_hashtag'] = sensitive_hashtag query_body = query_body_module('sensitive_activity_geo_aggs') sg_list = es.search(index='sensitive_user_portrait', doc_type='user', body=query_body)['aggregations']['all_interests']['buckets'] sensitive_geo = [] for item in sg_list: temp = [] temp.append(item['key']) temp.append(item['doc_count']) sensitive_geo.append(temp) results['sensitive_geo'] = sensitive_geo ''' query_body = query_body_module('domain_string') sd_list = es.search(index='sensitive_user_portrait', doc_type='user', body=query_body)['aggregations']['all_interests']['buckets'] domain = [] for item in sd_list: temp = [] temp.append(item['key']) temp.append(item['doc_count']) domain.append(temp) results['domain'] = domain ''' # tendency distribution # domain and topic rank_results = get_top_user() 
results.update(rank_results) # rank results['importance'] = search_in_portrait('importance', max_result) results['sensitive'] = search_in_portrait('sensitive', max_result) results['influence'] = search_in_portrait('influence', max_result) results['activeness'] = search_in_portrait('activeness', max_result) # 敏感微博转发量和评论量 mid_list = get_top_mid() sensitive_hot_retweet = sort_retweet_sensitive_weibo(mid_list) sensitive_hot_comment = sort_comment_sensitive_weibo(mid_list) sensitive_weibo_text = get_weibo_detail(mid_list) results['sensitive_hot_retweet'] = sensitive_hot_retweet results['sensitive_hot_comment'] = sensitive_hot_comment results['sensitive_weibo_text'] = sensitive_weibo_text r.set('overview', json.dumps(results))
def scan_compute_redis(): hash_name = 'compute' results = r.hgetall('compute') iter_user_list = [] mapping_dict = dict() #test count = 0 for uid in results: user_list = json.loads(results[uid]) in_date = user_list[0] status = user_list[1] if status == '1': #imme #test count += 1 iter_user_list.append(uid) mapping_dict[uid] = json.dumps([in_date, '3']) # mark status:3 computing if len(iter_user_list) % 100 == 0 and len(iter_user_list) != 0: r.hmset('compute', mapping_dict) #acquire bulk user weibo data if WEIBO_API_INPUT_TYPE == 0: user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts = read_flow_text_sentiment( iter_user_list) else: user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts = read_flow_text( iter_user_list) #compute text attribute compute_status = test_cron_text_attribute_v2( user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts) if compute_status == True: change_status_computed(mapping_dict) else: change_status_compute_fail(mapping_dict) #when uid user no weibo at latest week to change compute status to 1 if len(user_keywords_dict) != len(iter_user_list): change_mapping_dict = dict() change_user_list = set(iter_user_list) - set( user_keywords_dict.keys()) for change_user in change_user_list: change_mapping_dict[change_user] = json.dumps( [in_date, '1']) r.hmset('compute', change_mapping_dict) iter_user_list = [] mapping_dict = {} if iter_user_list != [] and mapping_dict != {}: r.hmset('compute', mapping_dict) #acquire bulk user weibo date print 'iter_user_list:', len(iter_user_list) if WEIBO_API_INPUT_TYPE == 0: user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts = read_flow_text_sentiment( iter_user_list) else: user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts = read_flow_text( iter_user_list) #compute text attribute print 'user_weibo_dict:', len(user_weibo_dict) compute_status = test_cron_text_attribute_v2(user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts) if compute_status == True: change_status_computed(mapping_dict) else: change_status_compute_fail(mapping_dict) #when uid user no weibo at latest week to change compute status to 1 if len(user_keywords_dict) != len(iter_user_list): change_mapping_dict = dict() change_user_list = set(iter_user_list) - set( user_keywords_dict.keys()) for change_user in change_user_list: change_mapping_dict[change_user] = json.dumps([in_date, '1']) r.hmset('compute', change_mapping_dict)
import sys
import time
import json
import redis
from elasticsearch import Elasticsearch
from text_attribute import compute_attribute

reload(sys)
sys.path.append('./../../')
from global_utils import R_RECOMMENTATION as r
from global_utils import es_sensitive_user_text as es_text
from time_utils import datetime2ts, ts2datetime

date = ts2datetime(time.time() - 24 * 3600).replace('-', '')
temp = r.hget('compute_appoint', date)
if temp:
    now_list = json.loads(temp)
    uid_list = []
    count = 0
    for item in now_list:
        uid_list.append(item[0])
    user_weibo_dict = dict()
    # extract user weibo text
    compute_attribute(user_weibo_dict)
    for i in range(len(now_list)):  # fixed: range() needs an integer, not the list itself
        uid = now_list[i][0]
        source = now_list[i][1]
        if source == '1':
            r.hset('identify_in_sensitive_' + str(date), uid, '3')  # finish compute
        else:
def scan_compute_redis():
    hash_name = 'compute'
    results = r.hgetall('compute')
    iter_user_list = []
    mapping_dict = dict()
    for uid in results:
        user_list = json.loads(results[uid])
        in_date = user_list[0]
        status = user_list[1]
        if status == '2':
            iter_user_list.append(uid)
            mapping_dict[uid] = json.dumps([in_date, '3'])  # mark status 3: computing
            # revise identify_in_date
            influence_hashname = 'identify_in_influence_' + str(in_date)
            sensitive_hashname = 'identify_in_sensitive_' + str(in_date)
            manual_hashname = "identify_in_manual_" + str(in_date)
            tmp = r.hget(influence_hashname, uid)
            tmp1 = r.hget(sensitive_hashname, uid)
            if tmp:
                r.hset(influence_hashname, uid, '3')
            elif tmp1:
                r.hset(sensitive_hashname, uid, '3')
            else:
                r.hset(manual_hashname, uid, '3')
        if len(iter_user_list) % 100 == 0 and len(iter_user_list) != 0:
            # mark status from 2 to 3: identify_compute to computing
            r.hmset('compute', mapping_dict)
            # acquire bulk user weibo data
            if WEIBO_API_INPUT_TYPE == 0:
                user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts = read_flow_text_sentiment(iter_user_list)
            else:
                user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts = read_flow_text(iter_user_list)
            # compute text attribute
            compute_status = test_cron_text_attribute_v2(user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts)
            if compute_status == True:
                change_status_computed(mapping_dict)
            else:
                change_status_compute_fail(mapping_dict)
            # deal with users who have no weibo to compute portrait attributes
            if len(user_keywords_dict) != len(iter_user_list):
                change_mapping_dict = dict()
                change_user_list = set(iter_user_list) - set(user_keywords_dict.keys())
                for change_user in change_user_list:
                    change_mapping_dict[change_user] = json.dumps([in_date, '2'])
                r.hmset('compute', change_mapping_dict)  # fixed: hash name was missing
            iter_user_list = []
            mapping_dict = {}
    if iter_user_list != [] and mapping_dict != {}:
        r.hmset('compute', mapping_dict)
        # acquire bulk user weibo data
        if WEIBO_API_INPUT_TYPE == 0:
            user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts = read_flow_text_sentiment(iter_user_list)
        else:
            user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts = read_flow_text(iter_user_list)
        # compute text attribute
        compute_status = test_cron_text_attribute_v2(user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts)
        if compute_status == True:
            change_status_computed(mapping_dict)
        else:
            change_status_compute_fail(mapping_dict)
        # deal with users who have no weibo to compute portrait attributes
        if len(user_keywords_dict) != len(iter_user_list):
            change_mapping_dict = dict()
            change_user_list = set(iter_user_list) - set(user_keywords_dict.keys())
            for change_user in change_user_list:
                change_mapping_dict[change_user] = json.dumps([in_date, '2'])
            r.hmset('compute', change_mapping_dict)  # fixed: hash name was missing
def save_recommentation2redis(date, user_set):
    hash_name = 'recommend'
    date = date.replace('-', '')
    if user_set:
        r.hset(hash_name, date, json.dumps(list(user_set)))
    return 1
# every 30 minutes re-execute this program
import sys
import time
import json
import redis
from elasticsearch import Elasticsearch
from text_attribute import compute_attribute

reload(sys)
sys.path.append('./../../')
from global_utils import R_RECOMMENTATION as r
from global_utils import es_sensitive_user_text as es_text
from time_utils import datetime2ts, ts2datetime

date = ts2datetime(time.time()).replace('-', '')
temp = r.hget('compute_now', date)
if temp:
    now_list = json.loads(temp)
    uid_list = []
    count = 0
    for item in now_list:
        uid_list.append(item[0])
    user_weibo_dict = dict()
    # extract user weibo text
    compute_attribute(user_weibo_dict)
    for i in range(len(now_list)):  # fixed: range() needs an integer, not the list itself
        uid = now_list[i][0]
        source = now_list[i][1]
        if source == '1':
            r.hset('identify_in_sensitive_' + str(date), uid, '3')  # finish compute