def filter_retweet_count(user_set):
    """Keep users whose distinct-retweeted-user count is below retweet_threshold.

    Batches user_set through ES mget on the date-sharded retweet index.
    Users missing from the index are kept (they cannot exceed the threshold);
    users at/above the threshold are logged to the module-level csv writer.
    Returns the list of kept uids.
    """
    FILTER_ITER_COUNT = 100  # mget batch size
    results = []
    now_ts = time.time()
    db_number = get_db_num(now_ts)  # retweet indices are sharded by date
    retweet_index_name = retweet_index_name_pre + str(db_number)
    search_user_count = len(user_set)
    iter_search_count = 0
    while iter_search_count < search_user_count:
        iter_search_user_list = user_set[iter_search_count:iter_search_count + FILTER_ITER_COUNT]
        try:
            retweet_result = es_retweet.mget(index=retweet_index_name,
                                             doc_type=retweet_index_type,
                                             body={'ids': iter_search_user_list},
                                             _source=True)['docs']
        except Exception:  # was a bare except; ES failure degrades to "no docs"
            retweet_result = []
        for retweet_item in retweet_result:
            user = retweet_item['_id']
            if retweet_item['found']:
                per_retweet_result = json.loads(retweet_item['_source']['uid_retweet'])
                # set() over the dict's keys replaces the manual add-loop
                retweet_set = set(per_retweet_result)
                if len(retweet_set) < retweet_threshold:
                    results.append(user)
                else:
                    writer.writerow([user, 'retweet'])
            else:
                # no retweet record at all: keep the user
                results.append(user)
        iter_search_count += FILTER_ITER_COUNT
    return results
def filter_retweet_count(user_set):
    """Keep users whose distinct-retweeted-user count is below retweet_threshold.

    NOTE(review): duplicate of an earlier definition of the same name in this
    file; in Python the later binding wins — confirm which one is intended.
    Batches user_set through ES mget; missing users are kept, users at/above
    the threshold are logged to the module-level csv writer.
    """
    FILTER_ITER_COUNT = 100  # mget batch size
    results = []
    now_ts = time.time()
    db_number = get_db_num(now_ts)  # retweet indices are sharded by date
    retweet_index_name = retweet_index_name_pre + str(db_number)
    search_user_count = len(user_set)
    iter_search_count = 0
    while iter_search_count < search_user_count:
        iter_search_user_list = user_set[iter_search_count:iter_search_count + FILTER_ITER_COUNT]
        try:
            retweet_result = es_retweet.mget(index=retweet_index_name,
                                             doc_type=retweet_index_type,
                                             body={'ids': iter_search_user_list},
                                             _source=True)['docs']
        except Exception:  # was a bare except; ES failure degrades to "no docs"
            retweet_result = []
        # the original `if retweet_result:` guard was redundant — iterating an
        # empty list is a no-op
        for retweet_item in retweet_result:
            user = retweet_item['_id']
            if retweet_item['found']:
                per_retweet_result = json.loads(retweet_item['_source']['uid_retweet'])
                retweet_set = set(per_retweet_result)  # distinct retweeted uids
                if len(retweet_set) < retweet_threshold:
                    results.append(user)
                else:
                    writer.writerow([user, 'retweet'])
            else:
                results.append(user)
        iter_search_count += FILTER_ITER_COUNT
    return results
def extend_network(task_name, ts): index_name = task_name # mu qian can yu de yonghu shu query_uid = { "query":{ "filtered":{ "filter":{ "range":{ "timestamp":{ "lt": ts } } } } }, "aggs":{ "uid_count":{"cardinality":{"field": "uid"}} } } uid_count = es_prediction.search(index=index_name, doc_type="text", \ body=query_uid)["aggregations"]["uid_count"]["value"] try: extend_retweet_threshold = float(r_stimulation.get("extend_retweet_threshold")) except: r_stimulation.set("extend_retweet_threshold", 10000) extend_retweet_threshold = 10000 user_list = organize_network(task_name, ts) exist_user_set = set(user_list) in_user_list = list() ####已存在的用户列表 in_user_info = [] count = 0 all_user_dict = dict() ## participate user >>> extended list list_len = len(user_list) len_1000 = list_len/1000 for i in range(len_1000+1): tmp_uid = user_list[i*1000: (i+1)*1000] es_results = es_retweet.mget(index=index_be_retweet,doc_type=index_type_be_retweet, body={"ids":tmp_uid})["docs"] for item in es_results: if item["found"]: count +=1 if count % 1000 == 0: print "extend network: ", count uid_be_retweet = json.loads(item["_source"]["uid_be_retweet"]) retweet_count = len(uid_be_retweet) if retweet_count < extend_retweet_threshold: # 对外扩展的阈值 continue uid_retweet_list = uid_be_retweet.keys() uid_retweet_list = list(set(uid_retweet_list)-exist_user_set) all_user_dict[item["_id"]] = uid_retweet_list # 扩展的用户 retweet_count = len(uid_be_retweet) in_user_list.append(item["_id"]) in_user_info.append([math.log(retweet_count+1), math.log(uid_count+1)]) return uid_count,in_user_list, in_user_info, all_user_dict
def extend_network(task_name): file_name = task_name + ".txt" f = open(task_name + ".txt", "w") line = 0 user2number_dict = dict() # mapping: number-uid number2user_dict = dict() count = 0 user_list = organize_network(task_name) list_len = len(user_list) len_1000 = list_len / 1000 for i in range(len_1000 + 1): tmp_uid = user_list[i * 1000:(i + 1) * 1000] es_results = es_retweet.mget(index=index_be_retweet, doc_type=index_type_be_retweet, body={"ids": tmp_uid})["docs"] for item in es_results: if item["found"]: print count uid_be_retweet = json.loads(item["_source"]["uid_be_retweet"]) be_retweet_list = uid_be_retweet.keys() uid = item["_id"] if user2number_dict.has_key(uid): uid_count = user2number_dict[uid] else: count += 1 uid_count = count user2number_dict[uid] = count number2user_dict[count] = uid for each in be_retweet_list: if user2number_dict.has_key(each): each_number = user2number_dict[each] else: count += 1 user2number_dict[each] = count number2user_dict[count] = uid each_number = count if each_number != uid_count: f.write(str(uid_count) + " " + str(each_number) + "\n") line += 1 f.close() cmd = 'sed -i "" -e "1i %s %s" %s' % (count, line, file_name) p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) es_prediction.update(index=index_manage_interfere_task, doc_type=type_manage_interfere_task,\ id=task_name, body={"doc":{"network_exist": "1"}}) print "finish: ", count file_user = open("user_" + task_name + ".txt", "w") for uid in user2number_dict.keys(): file_user.write(str(uid) + '\n')
def extend_network(task_name):
    # NOTE(review): this third definition of `extend_network` appears to be an
    # abandoned draft: it references `ts`, `index_name` and `es`, none of which
    # are defined in this scope (NameError at call time); it opens `f` but
    # never writes to or closes it; and it returns nothing. Confirm whether it
    # should be deleted or finished.
    # currently participating user count (original pinyin comment translated)
    query_uid = {
        "query": {
            "range": {
                "timestamp": {
                    "lt": ts  # NOTE(review): `ts` is not a parameter here
                }
            }
        },
        "aggs": {
            "uid_count": {
                "cardinality": {
                    "field": "uid"
                }
            }
        }
    }
    # NOTE(review): `es` and `index_name` are undefined in this scope
    uid_count = es.search(index=index_name, doc_type="text", \
                          body=query_uid)["aggregations"]["uid_count"]["value"]
    file_name = task_name + ".txt"
    f = open(task_name + ".txt", "w")  # NOTE(review): opened but never used/closed
    line = 0
    count = 0
    user_list = organize_network(task_name)
    important_user_list = list()  # uids with >=100 be-retweet fan-in
    important_user_info = []      # [retweet_count, uid_count] rows, aligned
    list_len = len(user_list)
    len_1000 = list_len / 1000    # Python 2 integer division: batches of 1000
    for i in range(len_1000 + 1):
        tmp_uid = user_list[i * 1000:(i + 1) * 1000]
        es_results = es_retweet.mget(index=index_be_retweet,
                                     doc_type=index_type_be_retweet,
                                     body={"ids": tmp_uid})["docs"]
        for item in es_results:
            if item["found"]:
                print count
                uid_be_retweet = json.loads(item["_source"]["uid_be_retweet"])
                retweet_count = len(uid_be_retweet)
                if retweet_count < 100:  # hard-coded fan-in threshold
                    continue
                important_user_list.append(item["_id"])
                important_user_info.append([retweet_count, uid_count])
    print "finish: ", count
    # NOTE(review): no return statement — important_user_list/_info are discarded
def get_extend(all_set):
    """Sample users from all_set, merge their retweet+comment neighbor dicts,
    and return the merged neighbor uids filtered through filter_out.

    At most RECOMMEND_IN_AUTO_RANDOM_SIZE users are sampled; fewer users means
    the whole set is used.
    """
    retweet_comment_dict_list = []
    # step0: random sample of users
    all_user_list = list(all_set)
    if RECOMMEND_IN_AUTO_RANDOM_SIZE > len(all_user_list):
        sample_users = all_user_list
    else:
        sample_users = random.sample(all_user_list, RECOMMEND_IN_AUTO_RANDOM_SIZE)
    db_number = get_db_num()
    # step1: get retweet neighbors
    retweet_index_name = retweet_index_name_pre + str(db_number)
    try:
        retweet_result = es_retweet.mget(index=retweet_index_name,
                                         doc_type=retweet_index_type,
                                         body={'ids': sample_users})['docs']
    except Exception:
        retweet_result = []
    # step1.2: collect each found user's uid->retweet-count dict
    for retweet_item in retweet_result:
        try:
            if retweet_item['found']:
                retweet_comment_dict_list.append(
                    json.loads(retweet_item['_source']['uid_retweet']))
        except Exception:
            pass
    # step2: get comment neighbors
    comment_index_name = comment_index_name_pre + str(db_number)
    try:
        # FIX: keyword was misspelled `indexd=`, raising TypeError that the
        # bare except swallowed — comment data was silently always empty
        comment_result = es_comment.mget(index=comment_index_name,
                                         doc_type=comment_index_type,
                                         body={'ids': sample_users})['docs']
    except Exception:
        comment_result = []
    # step2.2: collect each found user's uid->comment-count dict
    for comment_item in comment_result:
        try:
            if comment_item['found']:
                retweet_comment_dict_list.append(
                    json.loads(comment_item['_source']['uid_comment']))
        except Exception:
            pass
    # step3: union all neighbor dicts
    union_retweet_comment_list = union_dict(retweet_comment_dict_list)
    # step4: filter against user portrait
    extend_result = filter_out(union_retweet_comment_list.keys())
    return extend_result
def get_extend(all_set):
    """Sample users from all_set, merge their retweet+comment neighbor dicts,
    and return the merged neighbor uids filtered through filter_out.

    NOTE(review): duplicate of an earlier `get_extend` in this file; the later
    binding wins — confirm which copy is intended to survive.
    """
    retweet_comment_dict_list = []
    # step0: random sample of users
    all_user_list = list(all_set)
    if RECOMMEND_IN_AUTO_RANDOM_SIZE > len(all_user_list):
        sample_users = all_user_list
    else:
        sample_users = random.sample(all_user_list, RECOMMEND_IN_AUTO_RANDOM_SIZE)
    db_number = get_db_num()
    # step1: get retweet neighbors
    retweet_index_name = retweet_index_name_pre + str(db_number)
    try:
        retweet_result = es_retweet.mget(index=retweet_index_name,
                                         doc_type=retweet_index_type,
                                         body={'ids': sample_users})['docs']
    except Exception:
        retweet_result = []
    # step1.2: collect each found user's uid->retweet-count dict
    for retweet_item in retweet_result:
        try:
            if retweet_item['found']:
                retweet_comment_dict_list.append(
                    json.loads(retweet_item['_source']['uid_retweet']))
        except Exception:
            pass
    # step2: get comment neighbors
    comment_index_name = comment_index_name_pre + str(db_number)
    try:
        # FIX: keyword was misspelled `indexd=`, raising TypeError that the
        # bare except swallowed — comment data was silently always empty
        comment_result = es_comment.mget(index=comment_index_name,
                                         doc_type=comment_index_type,
                                         body={'ids': sample_users})['docs']
    except Exception:
        comment_result = []
    # step2.2: collect each found user's uid->comment-count dict
    for comment_item in comment_result:
        try:
            if comment_item['found']:
                retweet_comment_dict_list.append(
                    json.loads(comment_item['_source']['uid_comment']))
        except Exception:
            pass
    # step3: union all neighbor dicts
    union_retweet_comment_list = union_dict(retweet_comment_dict_list)
    # step4: filter against user portrait
    extend_result = filter_out(union_retweet_comment_list.keys())
    return extend_result
def get_friends_list(recommend_set_list):
    """Return up to 500 uids who retweeted any of the recommended users.

    Looks each recommended uid up in today's date-sharded be-retweet index;
    missing uids are skipped. Returns an empty list for empty input.
    """
    friend_list = []
    if not recommend_set_list:
        return friend_list
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))  # midnight ts selects today's shard
    db_number = get_db_num(now_date_ts)
    search_result = es_retweet.mget(index=be_retweet_index_name_pre + str(db_number),
                                    doc_type=be_retweet_index_type,
                                    body={"ids": recommend_set_list})["docs"]
    for item in search_result:
        if not item['found']:
            continue
        data = item['_source']['uid_be_retweet']
        # FIX: the field is JSON (every other function in this module applies
        # json.loads to uid_be_retweet); eval() on an ES payload was unsafe
        # and unnecessary
        friend_list.extend(json.loads(data).keys())
    return friend_list[:500]
def get_community_coreuser_socail(uid_list, timestamp):
    # Build in-group and out-group interaction records for a community of uids.
    # Returns (core_uidlist, outer_uidlist, core_user_socail, core_outer_socail)
    # where each *_socail record is [uid, uname, ruid, runame, count].
    # (sic: "socail" is the established spelling in this codebase.)
    uid2uname = get_user_name(uid_list)  # uid -> display name lookup
    result = {}
    #step1: get db number for the date-sharded retweet/comment indices
    db_num = get_db_num(timestamp)
    retweet_index_name = retweet_index_name_pre + str(db_num)
    be_retweet_index_name = be_retweet_index_name_pre + str(db_num)
    comment_index_name = comment_index_name_pre + str(db_num)
    be_comment_index_name = be_comment_index_name_pre + str(db_num)
    #step2: split uid list into mget batches of GROUP_ITER_COUNT
    iter_count = 0
    all_user_count = len(uid_list)
    in_stat_results = dict()
    out_stat_result = dict()
    all_in_record = []   # interactions between group members
    all_out_record = []  # interactions with users outside the group
    all_out_user_count = 0
    all_out_in_usr_count = 0
    while iter_count < all_user_count:
        iter_uid_list = uid_list[iter_count:iter_count + GROUP_ITER_COUNT]
        #step3: mget retweet
        try:
            retweet_result = es_retweet.mget(index=retweet_index_name, doc_type=retweet_index_type, \
                                             body={'ids': iter_uid_list})['docs']
        except:
            retweet_result = []
        retweet_dict = {}  # {uid1: {ruid1: count1, ruid2: count2}, uid2: {}, ...}
        for item in retweet_result:
            uid = item['_id']
            # try/except guards against malformed ES docs (missing _source)
            try:
                if item['found'] == True:
                    retweet_dict[uid] = json.loads(item['_source']['uid_retweet'])
            except:
                pass
        #step4: mget comment
        try:
            comment_result = es_comment.mget(index=comment_index_name, doc_type=comment_index_type, \
                                             body={'ids': iter_uid_list})['docs']
        except:
            comment_result = []
        comment_dict = {}  # {uid1: {ruid1: count1, ruid2: count2}, ...}
        for item in comment_result:
            uid = item['_id']
            try:
                if item['found'] == True:
                    comment_dict[uid] = json.loads(item['_source']['uid_comment'])
            except:
                pass
        #step5: mget be_retweet
        try:
            be_retweet_result = es_retweet.mget(index=be_retweet_index_name, doc_type=be_retweet_index_type, \
                                                body={'ids': iter_uid_list})['docs']
        except:
            be_retweet_result = []
        be_retweet_dict = dict()  # {uid1: {uid_be_retweet dict}, uid2: {}, ...}
        for item in be_retweet_result:
            uid = item['_id']
            try:
                if item['found'] == True:
                    be_retweet_dict[uid] = json.loads(item['_source']['uid_be_retweet'])
            except:
                pass
        #step6: mget be_comment
        try:
            be_comment_result = es_comment.mget(index=be_comment_index_name, doc_type=be_comment_index_type, \
                                                body={'ids': iter_uid_list})['docs']
        except:
            be_comment_result = []
        be_comment_dict = dict()  # {uid1: {uid_be_comment dict}, uid2: {}, ...}
        for item in be_comment_result:
            uid = item['_id']
            try:
                if item['found'] == True:
                    be_comment_dict[uid] = json.loads(item['_source']['uid_be_comment'])
            except:
                pass
        #step7: union retweet&comment, split into in-group vs out-group targets
        for iter_uid in iter_uid_list:
            try:
                user_retweet_result = retweet_dict[iter_uid]
            except:
                user_retweet_result = {}
            try:
                user_comment_result = comment_dict[iter_uid]
            except:
                user_comment_result = {}
            filter_in_dict, filter_out_dict = filter_union_dict(
                [user_retweet_result, user_comment_result], uid_list, 'in&out')
            #step8: record retweet/comment relations toward in-group uids as
            # [uid, uname, ruid, runame, count] rows (names fall back to uids)
            uid_in_record = []
            for ruid in filter_in_dict:
                item_list = []
                if iter_uid != ruid:  # skip self-relations
                    item_list.append(iter_uid)
                    if uid2uname.has_key(iter_uid):
                        iter_name = uid2uname[iter_uid]
                    else:
                        iter_name = iter_uid
                    item_list.append(iter_name)
                    item_list.append(ruid)
                    if uid2uname.has_key(ruid):
                        ruid_name = uid2uname[ruid]
                    else:
                        ruid_name = ruid
                    item_list.append(ruid_name)
                    item_list.append(filter_in_dict[ruid])
                    if item_list:
                        uid_in_record.append(item_list)
                    else:
                        pass
                else:
                    pass
            all_in_record.extend(uid_in_record)
            #step9: fold be_retweet/be_comment into the out-group relation dict
            try:
                user_be_retweet_result = be_retweet_dict[iter_uid]
            except:
                user_be_retweet_result = {}
            try:
                user_be_comment_result = be_comment_dict[iter_uid]
            except:
                user_be_comment_result = {}
            filter_out_dict = filter_union_dict([
                filter_out_dict, user_be_retweet_result, user_be_comment_result
            ], uid_list, 'out')
            #step10: record out-group relations as the same 5-field rows
            uid_out_record = []
            for ruid in filter_out_dict:
                item_list = []
                if iter_uid != ruid:
                    item_list.append(iter_uid)
                    if uid2uname.has_key(iter_uid):
                        iter_name = uid2uname[iter_uid]
                    else:
                        iter_name = iter_uid
                    item_list.append(iter_name)
                    item_list.append(ruid)
                    if uid2uname.has_key(ruid):
                        ruid_name = uid2uname[ruid]
                    else:
                        ruid_name = ruid
                    item_list.append(ruid_name)
                    item_list.append(filter_out_dict[ruid])
                    if item_list:
                        uid_out_record.append(item_list)
                    else:
                        pass
                else:
                    pass
            all_out_record.extend(uid_out_record)
        iter_count += GROUP_ITER_COUNT
    #step11: sort both record sets by interaction count (index 4), descending
    sort_in_record = sorted(all_in_record, key=lambda x: x[4], reverse=True)
    sort_out_record = sorted(all_out_record, key=lambda x: x[4], reverse=True)
    # core users: in-group pairs with more than 2 interactions
    core_user_socail = [item for item in sort_in_record if item[4] > 2]
    core_uidlist = list(set([item[0] for item in core_user_socail]))
    # outer candidates: out-group rows touching a core uid with count > 10;
    # NOTE(review): `item[0].split()`/`item[2].split()` intersect *substrings*
    # of a single uid with the core set — presumably just a membership test;
    # confirm intended.
    core_outer_socail_temp = [
        item for item in sort_out_record
        if (len(list(set(item[2].split()) & set(core_uidlist))) > 0
            or len(list(set(item[0].split()) & set(core_uidlist))) > 0)
        and item[4] > 10
    ]
    core_outer_socail = sorted(core_outer_socail_temp,
                               key=lambda x: x[4],
                               reverse=True)[0:30]  # top 30 outer relations
    outer_uidlist = [item[0] for item in core_outer_socail]
    return core_uidlist, outer_uidlist, core_user_socail, core_outer_socail
def detect_by_seed_users(seed_users):
    """Collect each seed user's retweet/be_retweet/comment/be_comment neighbor
    dicts from the date-sharded ES indices and union them per seed user into
    all_union_result_dict.

    NOTE(review): returns group_uid_list, which is never populated — the final
    uid-extraction step was left unimplemented (see TODO at the bottom).
    """
    # FIX: these two lines were written as comparisons (`retweet_mark == 1`),
    # which raised NameError at runtime instead of enabling both branches.
    retweet_mark = 1
    comment_mark = 1
    group_uid_list = []
    all_union_result_dict = {}
    # defaults so the union step below is safe even when a mark is disabled
    retweet_result = []
    be_retweet_result = []
    comment_result = []
    be_comment_result = []
    # get retweet/comment es db number
    now_ts = time.time()
    db_number = get_db_num(now_ts)
    # step1: mget retweet and be_retweet
    if retweet_mark == 1:
        retweet_index_name = retweet_index_name_pre + str(db_number)
        be_retweet_index_name = be_retweet_index_name_pre + str(db_number)
        try:
            retweet_result = es_retweet.mget(index=retweet_index_name, doc_type=retweet_index_type,
                                             body={'ids': seed_users}, _source=True)['docs']
        except Exception:
            retweet_result = []
        try:
            be_retweet_result = es_retweet.mget(index=be_retweet_index_name, doc_type=be_retweet_type,
                                                body={'ids': seed_users}, _source=True)['docs']
        except Exception:
            be_retweet_result = []
    # step2: mget comment and be_comment
    if comment_mark == 1:
        comment_index_name = comment_index_name_pre + str(db_number)
        be_comment_index_name = be_comment_index_name_pre + str(db_number)
        try:
            comment_result = es_comment.mget(index=comment_index_name, doc_type=comment_index_type,
                                             body={'ids': seed_users}, _source=True)['docs']
        except Exception:
            comment_result = []
        try:
            be_comment_result = es_comment.mget(index=be_comment_index_name, doc_type=be_comment_index_type,
                                                body={'ids': seed_users}, _source=True)['docs']
        except Exception:
            be_comment_result = []
    # step3: union the four neighbor dicts per seed user. enumerate FIXES the
    # original counter, which was initialised to 0 but never incremented, so
    # every seed user read mget document 0.
    for union_count, iter_search_uid in enumerate(seed_users):
        try:
            uid_retweet_dict = json.loads(retweet_result[union_count]['_source']['uid_retweet'])
        except Exception:
            uid_retweet_dict = {}
        try:
            uid_be_retweet_dict = json.loads(be_retweet_result[union_count]['_source']['uid_be_retweet'])
        except Exception:
            uid_be_retweet_dict = {}
        try:
            uid_comment_dict = json.loads(comment_result[union_count]['_source']['uid_comment'])
        except Exception:
            uid_comment_dict = {}
        try:
            uid_be_comment_dict = json.loads(be_comment_result[union_count]['_source']['uid_be_comment'])
        except Exception:
            uid_be_comment_dict = {}
        union_result = union_dict(uid_retweet_dict, uid_be_retweet_dict,
                                  uid_comment_dict, uid_be_comment_dict)
        all_union_result_dict[iter_search_uid] = union_result
    # TODO: extract all uids from all_union_result_dict into group_uid_list
    # (original note, translated: "a conversion/extraction of all uids from
    # all_union_result_dict is needed here").
    return group_uid_list
def get_structure_user(seed_uid_list, structure_dict, filter_dict):
    """Expand seed users hop-by-hop through retweet/comment links, then keep
    those that are in user_portrait and pass importance/influence filters.

    structure_dict: {'retweet': 0/1, 'comment': 0/1, 'hop': n} — which link
        types to follow and how many hops.
    filter_dict: {'importance': {'gte', 'lt'}, 'influence': {'gte', 'lt'},
        'count': target size} — portrait filters; expansion stops once
        count * DETECT_COUNT_EXPAND candidates are collected.
    Returns the list of in-portrait uids, sorted by interaction weight.
    """
    retweet_mark = int(structure_dict['retweet'])
    comment_mark = int(structure_dict['comment'])
    hop = int(structure_dict['hop'])
    # get retweet/comment es db number
    now_ts = time.time()
    db_number = get_db_num(now_ts)
    # iterate hop expansion from the seed list
    iter_hop_user_list = seed_uid_list
    iter_count = 0
    all_union_result = dict()
    # defaults so the union step is safe when a mark is disabled
    retweet_result = []
    be_retweet_result = []
    comment_result = []
    be_comment_result = []
    while iter_count < hop:  # hop number control
        iter_count += 1
        search_user_count = len(iter_hop_user_list)
        hop_union_result = dict()
        iter_search_count = 0
        while iter_search_count < search_user_count:
            iter_search_user_list = iter_hop_user_list[iter_search_count:
                                                       iter_search_count + DETECT_ITER_COUNT]
            # step1: mget retweet and be_retweet
            if retweet_mark == 1:
                retweet_index_name = retweet_index_name_pre + str(db_number)
                be_retweet_index_name = be_retweet_index_name_pre + str(db_number)
                try:
                    retweet_result = es_retweet.mget(index=retweet_index_name, doc_type=retweet_index_type,
                                                     body={'ids': iter_search_user_list}, _source=True)['docs']
                except Exception:
                    retweet_result = []
                try:
                    be_retweet_result = es_retweet.mget(index=be_retweet_index_name, doc_type=be_retweet_type,
                                                        body={'ids': iter_search_user_list}, _source=True)['docs']
                except Exception:
                    be_retweet_result = []
            # step2: mget comment and be_comment
            if comment_mark == 1:
                comment_index_name = comment_index_name_pre + str(db_number)
                be_comment_index_name = be_comment_index_name_pre + str(db_number)
                try:
                    comment_result = es_comment.mget(index=comment_index_name, doc_type=comment_index_type,
                                                     body={'ids': iter_search_user_list}, _source=True)['docs']
                except Exception:
                    comment_result = []
                try:
                    be_comment_result = es_comment.mget(index=be_comment_index_name, doc_type=be_comment_index_type,
                                                        body={'ids': iter_search_user_list}, _source=True)['docs']
                except Exception:
                    be_comment_result = []
            # step3: union the four neighbor dicts per user. enumerate FIXES
            # the original counter, which was initialised to 0 but never
            # incremented, so every user in the batch read mget document 0.
            for union_count, iter_search_uid in enumerate(iter_search_user_list):
                try:
                    uid_retweet_dict = json.loads(retweet_result[union_count]['_source']['uid_retweet'])
                except Exception:
                    uid_retweet_dict = {}
                try:
                    uid_be_retweet_dict = json.loads(be_retweet_result[union_count]['_source']['uid_be_retweet'])
                except Exception:
                    uid_be_retweet_dict = {}
                try:
                    uid_comment_dict = json.loads(comment_result[union_count]['_source']['uid_comment'])
                except Exception:
                    uid_comment_dict = {}
                try:
                    uid_be_comment_dict = json.loads(be_comment_result[union_count]['_source']['uid_be_comment'])
                except Exception:
                    uid_be_comment_dict = {}
                union_result = union_dict(uid_retweet_dict, uid_be_retweet_dict,
                                          uid_comment_dict, uid_be_comment_dict)
                hop_union_result = union_dict(hop_union_result, union_result)
            # step4: advance the batch window
            iter_search_count += DETECT_ITER_COUNT
        # drop the users of this hop from their own expansion result
        for iter_hop_user_item in iter_hop_user_list:
            try:
                hop_union_result.pop(iter_hop_user_item)
            except Exception:
                pass
        # next hop expands from this hop's newly discovered users
        iter_hop_user_list = hop_union_result.keys()
        all_union_result = union_dict(all_union_result, hop_union_result)
    # step5: keep candidates that exist in user_portrait and pass the filters
    sort_all_union_result = sorted(all_union_result.items(), key=lambda x: x[1], reverse=True)
    iter_count = 0
    all_count = len(sort_all_union_result)
    in_portrait_result = []
    filter_importance_from = filter_dict['importance']['gte']
    filter_importance_to = filter_dict['importance']['lt']
    filter_influence_from = filter_dict['influence']['gte']
    filter_influence_to = filter_dict['influence']['lt']
    while iter_count < all_count:
        iter_user_list = [item[0] for item in
                          sort_all_union_result[iter_count:iter_count + DETECT_ITER_COUNT]]
        try:
            portrait_result = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type,
                                                    body={'ids': iter_user_list}, _source=True)['docs']
        except Exception:
            portrait_result = []
        for portrait_item in portrait_result:
            if portrait_item['found']:
                if filter_importance_from <= portrait_item['_source']['importance'] <= filter_importance_to:
                    if filter_influence_from <= portrait_item['_source']['influence'] <= filter_influence_to:
                        in_portrait_result.append(portrait_item['_id'])
        # stop once we have enough candidates (expansion head-room factor)
        if len(in_portrait_result) > (filter_dict['count'] * DETECT_COUNT_EXPAND):
            break
        iter_count += DETECT_ITER_COUNT
    return in_portrait_result
def predict_user_influence(task_name, stop_time, ts): future_total = 0 # 未来传播总量 current_total = 0 # 可控范围 uid_count, in_user_list, in_user_info, all_user_dict = extend_network(task_name, ts) with open("gbdt.pkl", "r") as f: gbdt = pickle.load(f) # 已出现的重要用户阈值 try: in_user_threshold = float(r_stimulation.get("in_user_threshold")) except: r_stimulation.set("in_user_threshold", 1000) in_user_threshold = 1000 in_results = gbdt.predict(in_user_info) print "len(in_user_list): ", len(in_user_list) prediction_in = dict() for i in range(len(in_user_list)): if math.exp(in_results[i]) > in_user_threshold: # 1000 prediction_in[in_user_list[i]] = math.exp(in_results[i]) future_dict = dict() count = 0 for k,v in all_user_dict.iteritems(): uid = k print "k: ", k print "v: ", len(v) tmp_prediction_list = [] # tmp storage tmp_uid_list = [] if 1: user_list = v list_len = len(user_list) len_1000 = list_len/1000 for i in range(len_1000+1): tmp_uid = user_list[i*1000: (i+1)*1000] if not tmp_uid: continue es_results = es_retweet.mget(index=index_be_retweet,doc_type=index_type_be_retweet, body={"ids":tmp_uid})["docs"] for item in es_results: if item["found"]: count += 1 uid_be_retweet = json.loads(item["_source"]["uid_be_retweet"]) retweet_count = len(uid_be_retweet) if retweet_count < 1000: continue tmp = [] tmp.append(math.log(retweet_count+1)) tmp.append(math.log(uid_count+1)) tmp_prediction_list.append(tmp) tmp_uid_list.append(item["_id"]) if count % 1000 == 0: iter_prediction_list, t1, t2 = prediction_model(uid,gbdt, tmp_prediction_list, tmp_uid_list, future_dict) future_dict = iter_prediction_list tmp_prediction_list = [] tmp_uid_list = [] future_total += t1 current_total += t2 print "iter prediction: ", count if tmp_prediction_list: iter_prediction_list, t1, t2 = prediction_model(uid,gbdt, tmp_prediction_list, tmp_uid_list, future_dict) future_dict = iter_prediction_list future_total += t1 current_total += t2 print "future_dict: ", future_dict # storage save_results(task_name, ts, 
prediction_in, future_dict) # do left things dispose_results(task_name, ts, future_total, current_total) # update processing state es_prediction.update(index=index_manage_interfere_task,doc_type=type_manage_interfere_task,\ id=task_name, body={"doc":{"stimulation_processing_status":"0", "update_time": ts, "scan_text_finish":"0"}}) # stop task if ts >= stop_time: es_prediction.update(index=index_manage_interfere_task,doc_type=\ type_manage_interfere_task,id=task_name,body={"doc":{"finish":"1"}})