def scan_mapper(pre, sen_pre, r):
    if RUN_TYPE:
        ts = datetime2ts(ts2datetime(time.time() - DAY))
    else:
        ts = datetime2ts('2013-09-01')
    ts = str(ts)
    hash_name = pre + ts
    sen_hash_name = sen_pre + ts
    cursor = 0
    count = 0
    tb = time.time()
    while 1:
        re_scan = r.hscan(hash_name, cursor, count=1000)
        cursor = re_scan[0]
        ip_dict = re_scan[1]
        uid_list = ip_dict.keys()
        if uid_list:
            r.lpush('act_uid_list', json.dumps(uid_list))
            count += len(uid_list)
        ts = time.time()
        print '%s : %s' % (count, ts - tb)
        tb = ts
        if cursor == 0:
            print count
            break
def scan_mapper():
    if RUN_TYPE:
        ts = datetime2ts(ts2datetime(time.time() - DAY))
    else:
        ts = datetime2ts('2016-05-14')
    ts = str(ts)
    hash_name = sen_pre_ip + ts
    cursor = 0
    count = 0
    tb = time.time()
    while 1:
        re_scan = redis_ip.hscan(hash_name, cursor, count=1000)
        cursor = re_scan[0]
        ip_dict = re_scan[1]
        uid_list = ip_dict.keys()
        if uid_list:
            redis_ip.lpush('sensitive_ip_uid_list', json.dumps(uid_list))
            count += len(uid_list)
        ts = time.time()
        print '%s : %s' % (count, ts - tb)
        tb = ts
        if cursor == 0:
            print count
            break
def filter_activity(user_set):
    results = []
    now_date = ts2datetime(time.time())
    now_date = '2013-09-08'
    ts = datetime2ts(now_date) - 24 * 3600
    date = ts2datetime(ts)
    timestamp = datetime2ts(date)
    for user in user_set:
        over_count = 0
        for i in range(0, 7):
            ts = timestamp - 3600 * 24 * i
            result = r_cluster.hget('activity_' + str(ts), str(user))
            if result:
                item_dict = json.loads(result)
                sorted_dict = sorted(item_dict.iteritems(), key=lambda asd: asd[1], reverse=True)
                if sorted_dict[0][1] > activity_threshold:
                    over_count = 1
        if over_count == 0:
            results.append(user)
        else:
            writer.writerow([user, 'activity'])
    print 'after filter activity: ', len(results)
    return results
def weibo_sort_interface(username, time, sort_scope, sort_norm, arg, st, et, task_number, number):
    task_number = int(task_number)
    print "user_interface:", number
    weibo_list = []
    during = (datetime2ts(et) - datetime2ts(st)) / DAY + 1
    time = 1
    if during > 16:
        time = 30
    elif during > 3:
        time = 7
    query_body = {
        "query": {
            "terms": {
                "status": [0, -1]
            }
        }
    }
    if sort_scope == 'all_limit_keyword':
        running_number = es_weibo_portrait.count(index=WEIBO_RANK_KEYWORD_TASK_INDEX, doc_type=WEIBO_RANK_KEYWORD_TASK_TYPE, body=query_body)['count']
        if running_number > task_number - 1:
            return "more than limit"
        # deal with the offline task
        search_id = add_task(username, type="keyword", during=during, st=st, et=et, arg=arg, sort_norm=sort_norm, sort_scope=sort_scope, time=time, number=number)
        return {"flag": True, "search_id": search_id}
    elif sort_scope == 'all_nolimit':
        pass
    return weibo_list
def main():
    RUN_TYPE = 0
    if RUN_TYPE == 1:
        now_ts = time.time()
    else:
        now_ts = datetime2ts(RUN_TEST_TIME)
    now_ts = datetime2ts('2013-09-02')
    date = ts2datetime(now_ts - DAY)
    # auto recommendation: step 1-4
    #step1: read from top es_daily_rank
    top_user_set, user_dict = search_from_es(date)
    #step2: filter black_uid
    black_user_set = read_black_user()
    subtract_user_set = top_user_set - black_user_set
    #step3: filter users who have been in
    subtract_user_set = list(subtract_user_set)
    candidate_results = filter_in(subtract_user_set)
    #step4: filter rules about ip count & reposts/bereposts count & activity count
    results = filter_rules(candidate_results)
    #step5: get sensitive user
    sensitive_user = list(get_sensitive_user(date))
    results = set(results) - set(sensitive_user)  # influence user - sensitive user
    new_date = ts2datetime(now_ts)
    hashname_influence = "recomment_" + new_date + "_influence"
    if results:
        for uid in results:
            #print uid
            r.hset(hashname_influence, uid, "0")
    hashname_sensitive = "recomment_" + new_date + "_sensitive"
    if sensitive_user:
        for uid in sensitive_user:
            #print "sensitive"
            r.hset(hashname_sensitive, uid, "0")
def update_day_sensitive(uid_list): results = {} for uid in uid_list: results[uid] = {"sensitive": 0, "sensitive_string": "", "sensitive_dict": json.dumps({})} all_results = {} now_ts = time.time() if RUN_TYPE == 1: now_date_ts = datetime2ts(ts2datetime(now_ts)) else: now_date_ts = datetime2ts("2013-09-03") today_sensitive_dict = {} sensitive_results = redis_cluster.hmget("sensitive_" + str(now_date_ts), uid_list) for item in sensitive_results: for uid, words_dict in item.iteritems(): sensitive_index = 0 sensitive_words_dict = {} if words_dict: sensitive_words_dict = json.dumps(words_dict) for word, count in words_dict.iter_items(): tmp_stage = r_sensitive.hget("sensitive_words", word) if tmp_stage: tmp = json.loads(tmp_stage) sensitive_index += sensitive_score_dict[tmp[0]] * count sensitive_words_string = "&".join(sensitive_words_dict.keys()) results[uid] = { "sensitive": sensitive_index, "sensitive_words_string": sensitive_words_string, "sensitive_words_dict": sensitive_words_dict, } return results
def key_words_search(pre, time, start_time, keyword, type='in'):
    date = start_time
    index_name = pre + start_time
    while not es.indices.exists(index=index_name):
        new_time = datetime2ts(date) + DAY
        date = ts2datetime(new_time)
        index_name = pre + date
        time -= 1
    uid_set = set()
    for i in range(time):
        print index_name
        query = {"query": {"bool": {"must": [{"prefix": {"text.text": keyword}}], "must_not": [], "should": []}}, "size": MAX_ITEMS, "sort": [], "facets": {}, "fields": ['uid']}
        try:
            temp = es.search(index=index_name, doc_type='text', body=query)
            result = temp['hits']['hits']
            print "Fetch " + str(len(result))
            for item in result:
                uid_set.add(item['fields']['uid'][0].encode("utf-8"))
        except Exception, e:
            print e
            raise Exception('user_list failed!')
        new_time = datetime2ts(date) + DAY
        date = ts2datetime(new_time)
        index_name = pre + date
    return uid_set
def key_words_search(search_type, pre, during, start_time, keyword, search_key='', sort_norm='', sort_scope='', time=1, isall=False):
    keywords = keyword.split(",")
    should = []
    for key in keywords:
        if search_type == "hashtag":
            should.append({"prefix": {"text.text": "#" + key + "#"}})
        else:
            should.append({"prefix": {"text.text": key}})
    date = start_time
    index_name = pre + start_time
    while not es_9206.indices.exists(index=index_name):
        new_time = datetime2ts(date) + DAY
        date = ts2datetime(new_time)
        index_name = pre + date
        during -= 1
    uid_set = set()
    for i in range(during):
        print index_name
        query = {"query": {"bool": {"must": [], "must_not": [], "should": should}}, "size": MAX_ITEMS, "sort": [], "facets": {}, "fields": ['uid']}
        try:
            temp = es_9206.search(index=index_name, doc_type='text', body=query)
            result = temp['hits']['hits']
            print "Fetch " + str(len(result))
            for item in result:
                uid_set.add(item['fields']['uid'][0].encode("utf-8"))
        except Exception, e:
            print e
            raise Exception('user_list failed!')
        new_time = datetime2ts(date) + DAY
        date = ts2datetime(new_time)
        index_name = pre + date
    return uid_set
def get_interval_count(topic, start_ts, end_ts):
    results = [0]
    ts_list = []
    #unit = 900
    #during = Day
    during = interval_count_during
    start_ts = datetime2ts(ts2datetime(start_ts))
    ts_list.append(start_ts)
    #end_ts = datetime2ts(ts2datetime(end_ts))
    # deal with an end time that is not a whole day
    print 'before deal end_ts:', ts2date(end_ts)
    if end_ts - datetime2ts(ts2datetime(end_ts)) != 0:
        end_ts = datetime2ts(ts2datetime(end_ts)) + 3600 * 24
    print 'get_interval_count start_ts:', ts2date(start_ts)
    print 'get_interval_count end_ts:', ts2date(end_ts)
    windowsize = (end_ts - start_ts) / Day
    interval = (end_ts - start_ts) / during
    for i in range(interval, 0, -1):
        begin_ts = end_ts - during * i
        over_ts = begin_ts + during
        ts_list.append(over_ts)
        items = db.session.query(PropagateCountNews).filter(PropagateCountNews.topic==topic ,\
                                                            PropagateCountNews.end<=over_ts ,\
                                                            PropagateCountNews.end>begin_ts ,\
                                                            PropagateCountNews.range==unit).all()
        if items:
            result = Merge_propagate(items)
        else:
            result = 0
        results.append(float(result))
    return ts_list, results
def filter_activity(user_set):
    results = []
    now_date = ts2datetime(time.time())
    # test
    now_date = '2013-09-08'
    ts = datetime2ts(now_date) - 24 * 3600
    date = ts2datetime(ts)
    #print 'date:', date
    timestamp = datetime2ts(date)
    for user in user_set:
        over_count = 0
        for i in range(0, 7):
            ts = timestamp - 3600 * 24 * i
            result = r_cluster.hget('activity_' + str(ts), str(user))
            if result:
                items_dict = json.loads(result)
                for item in items_dict:
                    weibo_count = items_dict[item]
                    if weibo_count > activity_threshold:
                        over_count += 1
        if over_count == 0:
            results.append(user)
        else:
            writer.writerow([user, 'activity'])
    print 'after filter activity:', len(results)
    return results
def filter_activity(user_set):
    results = []
    #run_type
    if RUN_TYPE == 1:
        now_date = ts2datetime(time.time())
    else:
        now_date = RUN_TEST_TIME
    ts = datetime2ts(now_date) - DAY
    date = ts2datetime(ts)
    timestamp = datetime2ts(date)
    for user in user_set:
        over_count = 0
        for i in range(0, 7):
            ts = timestamp - DAY * i
            result = redis_activity.hget('activity_' + str(ts), str(user))
            if result:
                items_dict = json.loads(result)
                for item in items_dict:
                    weibo_count = items_dict[item]
                    if weibo_count > activity_threshold:
                        over_count += 1
        if over_count == 0:
            results.append(user)
        else:
            writer.writerow([user, 'activity'])
    return results
def update_day_sensitive(uid_list):
    results = {}
    count = 0
    for uid in uid_list:
        results[uid] = {"sensitive": 0, 'sensitive_string': "", 'sensitive_dict': json.dumps({})}
    all_results = {}
    now_ts = time.time()
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = datetime2ts('2013-09-02')
    today_sensitive_dict = {}
    sensitive_results = redis_cluster.hmget("sensitive_" + str(now_date_ts), uid_list)
    for item in sensitive_results:
        if not item:
            count += 1
            continue
        uid = uid_list[count]
        item = json.loads(item)
        sensitive_index = 0
        sensitive_words_dict = {}
        for word, word_count in item.iteritems():
            tmp_stage = r_sensitive.hget("sensitive_words", word)
            if tmp_stage:
                tmp = json.loads(tmp_stage)
                sensitive_index += sensitive_score_dict[str(tmp[0])] * word_count
        sensitive_words_string = "&".join(item.keys())
        results[uid] = {'sensitive': sensitive_index, "sensitive_words_string": sensitive_words_string, "sensitive_words_dict": item}
        count += 1
    return results
def get_activity_time(uid_list):
    results = {}
    now_ts = time.time()
    now_date = ts2datetime(now_ts)
    #run_type
    if RUN_TYPE == 1:
        timestamp = datetime2ts(now_date)
    else:
        timestamp = datetime2ts("2013-09-08")
    activity_list_dict = {}  # {uid:[activity_list], uid:[]}
    for i in range(1, WEEK + 1):
        ts = timestamp - DAY * i
        if WORK_TYPE != 0:
            r_result = redis_activity.hmget('activity_' + str(ts), uid_list)
        else:
            r_result = []
            index_name = "activity_" + str(ts2datetime(ts))
            exist_bool = es_cluster.indices.exists(index=index_name)
            if exist_bool:
                es_results = es_cluster.mget(index=index_name, doc_type="activity", body={"ids": uid_list})["docs"]
                for item in es_results:
                    if item['found']:
                        r_result.append(item['_source']['activity_dict'])
                    else:
                        r_result.append(json.dumps({}))
            else:
                r_result = [json.dumps(dict())] * len(uid_list)
        if r_result:
            for j in range(0, len(uid_list)):
                uid = uid_list[j]
                if uid not in activity_list_dict:
                    activity_list_dict[uid] = [0 for i in range(0, 96)]
                user_r_result = r_result[j]
                if user_r_result:
                    user_activity_dict = json.loads(user_r_result)
                    for i in range(0, 96):
                        try:
                            count = user_activity_dict[str(i)]
                        except:
                            count = 0
                        activity_list_dict[uid].append(count)
    for uid in uid_list:
        activity_list = activity_list_dict[uid]
        statusnum = sum(activity_list)
        signal = np.array(activity_list)
        fftResult = np.abs(np.fft.fft(signal)) ** 2
        n = signal.size
        freq = np.fft.fftfreq(n, d=1)
        i = 0
        max_val = 0
        max_freq = 0
        for val in fftResult:
            if val > max_val and freq[i] > 0:
                max_val = val
                max_freq = freq[i]
            i += 1
        results[uid] = {'statusnum': statusnum, 'activity_time': math.log(max_freq + 1)}
    return results
def update_day_hashtag(uid_list):
    results = {}
    all_results = {}
    now_ts = time.time()
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = datetime2ts("2013-09-02")
    for i in range(WEEK, 0, -1):
        ts = now_date_ts - DAY * i
        count = 0
        hashtag_results = redis_cluster.hmget("hashtag_" + str(ts), uid_list)
        for uid in uid_list:
            if uid not in results:
                results[uid] = {}
            hashtag_item = hashtag_results[count]
            if hashtag_item:
                hashtag_dict = json.loads(hashtag_item)
            else:
                hashtag_dict = {}
            for hashtag in hashtag_dict:
                try:
                    results[uid][hashtag] += 1
                except:
                    results[uid][hashtag] = 1
            count += 1
    for uid in uid_list:
        user_hashtag_dict = results[uid]
        hashtag_string = "&".join(user_hashtag_dict.keys())
        all_results[uid] = {"hashtag_string": hashtag_string, "hashtag_dict": json.dumps(user_hashtag_dict)}
    return all_results
def add_task(user_name, type="keyword", range="all", pre='flow_text_', during='1', start_time='2013-09-07', end_time='2013-09-07', keyword='hello,world', sort_norm='bci', sort_scope='in_limit_keyword', time=7, isall=False, number=100):
    time_now = int(TIME.time())
    task_id = user_name + "-" + str(time_now)
    tmp_list = keyword.split(',')
    keyword_list = []
    for item in tmp_list:
        if item:
            keyword_list.append(item)
    body_json = {
        'submit_user': user_name,
        'keyword': json.dumps(keyword_list),
        'keyword_string': "&".join(keyword_list),
        'submit_time': ts2datetime(time_now),
        'create_time': time_now,
        'end_time': datetime2ts(end_time),
        'search_type': type,
        'status': 0,
        'range': range,
        'user_ts': user_name + '-' + str(time_now),
        'pre': pre,
        'during': during,
        'start_time': datetime2ts(start_time),
        'sort_norm': sort_norm,
        'sort_scope': sort_scope,
        'time': time,
        'isall': isall,
        'number': number
    }
    es.index(index=USER_RANK_KEYWORD_TASK_INDEX, doc_type=USER_RANK_KEYWORD_TASK_TYPE, id=task_id, body=body_json)
    return body_json["user_ts"]
def read_flow_text(uid_list):
    '''
    Read users' weibo (the returned records carry no sentiment label):
    Input: uid_list (list of uid strings)
    Output: word_dict (per-user term-frequency dict), weibo_list (list of weibo records)
    word_dict example: {uid1:{'w1':f1,'w2':f2...}...}
    weibo_list example: [[uid1,text1,ts1],[uid2,text2,ts2],...] (each record holds uid, text, timestamp)
    '''
    word_dict = dict()  # per-user term-frequency dict
    weibo_list = []  # weibo list
    online_pattern_dict = {}  # {uid:[online_pattern1, ..],...}
    now_ts = time.time()
    #run_type
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = datetime2ts(RUN_TEST_TIME)
    start_date_ts = now_date_ts - DAY * WEEK
    for i in range(0, WEEK):
        iter_date_ts = start_date_ts + DAY * i
        flow_text_index_date = ts2datetime(iter_date_ts)
        flow_text_index_name = flow_text_index_name_pre + flow_text_index_date
        print flow_text_index_name
        try:
            flow_text_exist = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,\
                    body={'query': {'filtered': {'filter': {'terms': {'uid': uid_list}}}}, 'size': MAX_VALUE},\
                    _source=False, fields=['text', 'uid', 'keywords_dict', 'timestamp'])['hits']['hits']
        except:
            flow_text_exist = []
        for flow_text_item in flow_text_exist:
            uid = flow_text_item['fields']['uid'][0].encode('utf-8')
            text = flow_text_item['fields']['text'][0].encode('utf-8')
            ts = flow_text_item['fields']['timestamp'][0]
            keywords_dict = json.loads(flow_text_item['fields']['keywords_dict'][0])
            keywords_dict = json.dumps(keywords_dict, encoding="UTF-8", ensure_ascii=False)
            keywords_dict = eval(keywords_dict)
            if word_dict.has_key(uid):
                item_dict = Counter(word_dict[uid])
                keywords_dict = Counter(keywords_dict)
                item_dict = dict(item_dict + keywords_dict)
                word_dict[uid] = item_dict
            else:
                word_dict[uid] = keywords_dict
            weibo_list.append([uid, text, ts])
            #test online pattern
            online_pattern = u'weibo.com'
            try:
                user_online_pattern_dict = online_pattern_dict[uid]
            except:
                online_pattern_dict[uid] = {}
            try:
                online_pattern_dict[uid][online_pattern] += 1
            except:
                online_pattern_dict[uid][online_pattern] = 1
    return word_dict, weibo_list, online_pattern_dict, start_date_ts
def get_db_num():
    date = ts2datetime(time.time())
    date_ts = datetime2ts(date)
    r_begin_ts = datetime2ts(R_BEGIN_TIME)
    db_number = ((date_ts - r_begin_ts) / (DAY * 7)) % 2 + 1
    #run_type
    if RUN_TYPE == 0:
        db_number = 1
    return db_number
def main():
    #step1: get task from redis queue (rpop)
    #step2: get monitor task time record from redis----data: {'monitor_task_time_record':{task_name, compute_start_ts}}
    #step3: identify whether the compute_start_ts can be computed
    #step4: get task user from es---group_result
    #step5: compute differently according to the task user count
    #step6: compute task mid-result
    #step7: save the mid-result in mid-result es----timestamp as field
    #step8: identify whether the track task is still doing, i.e. not ended/deleted from group_result es (status==1, not 0)
    #step9: if track_task is doing: update the compute_start_ts
    #step10: if track_task is doing: lpush task name to redis queue (keep the task in queue)
    #step11: if track_task is not doing: delete the compute_start_ts from redis
    while True:
        task_name = get_task_name()
        if task_name:
            start_ts = r_task.hget('monitor_task_time_record', task_name)
            start_ts = int(start_ts)
            #now_ts = time.time()
            #test
            now_ts = datetime2ts('2013-09-08')
            if start_ts == now_ts:
                status = add_task_name(task_name)
                if status == 0:
                    print 'add task to redis fail'
                    break
            if start_ts + 900 <= now_ts:
                task_user = get_task_user(task_name)
                if len(task_user) == 1:
                    print 'compute %s start_ts %s' % (task_name, ts2date(start_ts))
                    status = compute_mid_result_one(task_name, task_user, start_ts)
                else:
                    print 'compute %s start_ts %s' % (task_name, ts2date(start_ts))
                    status = compute_mid_result_group(task_name, task_user, start_ts)
                #compute group polarization----compute once a day
                if datetime2ts(ts2datetime(start_ts)) == start_ts:
                    print 'start compute group inner %s' % ts2date(start_ts)
                    group_status = compute_group_inner(task_name, task_user, start_ts)
                    status += group_status
                if status == 0:
                    print 'there is a bug about %s task' % task_name
                else:
                    #update the record time
                    start_ts += 900
                    task_doing_status = identify_task_doing(task_name)
                    print 'task_doing_status:', task_doing_status
                    if task_doing_status == True:
                        r_task.hset('monitor_task_time_record', task_name, start_ts)
                        status = add_task_name(task_name)
                        if status == 0:
                            print 'add task name to redis fail'
                    else:
                        r_task.hdel('monitor_task_time_record', task_name)
def sort_task(user, keyword, status, start_time, end_time, submit_time):
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [
                            {"term": {"submit_user": user}}
                        ]
                    }
                }
            }
        },
        "size": 10000,
        "sort": {"submit_time": {"order": "desc"}}
    }
    query_list = []
    if keyword:
        keyword_list = keyword.split(',')
        query_list.append({"terms": {"keyword_string": keyword_list}})
    if status != 2:
        query_list.append({"term": {"status": status}})
    if start_time and end_time:
        start_ts = datetime2ts(start_time)
        end_ts = datetime2ts(end_time)
        query_list.append({"range": {"start_time": {"gte": start_ts, "lte": end_ts}}})
        query_list.append({"range": {"end_time": {"gte": start_ts, "lte": end_ts}}})
    if submit_time:
        query_list.append({"term": {"submit_time": submit_time}})
    if query_list:
        query_body["query"]["filtered"]["filter"]["bool"]["must"].extend(query_list)
    #print query_body
    search_results = es.search(index=USER_RANK_KEYWORD_TASK_INDEX, doc_type=USER_RANK_KEYWORD_TASK_TYPE, body=query_body)["hits"]["hits"]
    results = []
    if search_results:
        for item in search_results:
            iter_item = item['_source']
            tmp = []
            tmp.append(iter_item['search_type'])
            tmp.append(json.loads(iter_item['keyword']))
            tmp.append(ts2datetime(iter_item['start_time']))
            tmp.append(ts2datetime(iter_item['end_time']))
            tmp.append(iter_item['range'])
            tmp.append(ts2date(iter_item['create_time']))
            tmp.append(iter_item['status'])
            tmp.append(iter_item['sort_norm'])
            tmp.append(iter_item['sort_scope'])
            tmp.append(item['_id'])  # task_name
            results.append(tmp)
    return results
def hot_uid_by_word(starttime, endtime, count=50):
    '''Select bloggers that have more than `count` (default 50) records in the words table.
    '''
    startdate = ts2datetime(datetime2ts(starttime))
    enddate = ts2datetime(datetime2ts(endtime))
    uids = set()
    uids_count = db.session.query(Words.uid, func.count(Words.id)).\
            filter(Words.postDate>startdate, Words.postDate<enddate).\
            group_by(Words.uid).\
            having(func.count(Words.id) > count).all()
    for uid, count in uids_count:
        uids.add(uid)
    return uids
def user_sort_interface(time=1, sort_norm='imp', sort_scope='in_nolimit', arg=None, start_time='2013-09-01', end_time='2013-09-07'):
    uid_list = []
    return_data = {}
    try:
        first_stage_time = datetime.datetime.now()
        #find the userids which are in the scope
        if sort_scope == 'all_nolimit':
            uid_list = all_sort_filter(time, sort_norm, sort_scope, arg)
        elif sort_scope == 'all_limit_keyword':
            during = (datetime2ts(end_time) - datetime2ts(start_time)) / DAY
            time = 1
            if during > 16:
                time = 30
            elif during > 3:
                time = 7
            uid_list = key_words_search('flow_text_2013_', during, start_time, arg, 'all')
            uid_list = sort_norm_filter(uid_list, sort_norm, time)
        elif sort_scope == "in_limit_keyword":
            during = (datetime2ts(end_time) - datetime2ts(start_time)) / DAY
            time = 1
            if during > 16:
                time = 30
            elif during > 3:
                time = 7
            uid_list = key_words_search('flow_text_', during, start_time, arg, 'all')
            uid_list = sort_norm_filter(uid_list, sort_norm, time)
        else:
            uid_list = in_sort_filter(time, sort_norm, sort_scope, arg)
        #make up the result with the userid list
        user_info_list = make_up_user_info(uid_list)
        second_stage_time = datetime.datetime.now()
        print "info-makeup's mission complete, time consumed: " + str(second_stage_time - first_stage_time)
        #make up the JSON return data
        return_data['flag'] = True
        return_data['data'] = user_info_list
        third_stage_time = datetime.datetime.now()
        print "JSON-maker's mission complete, time consumed: " + str(third_stage_time - second_stage_time)
        return return_data
    except RuntimeError, e1:
        print "RuntimeError : " + str(e1)
        return_data['flag'] = False
        return_data['error_msg'] = "time out"
        return return_data
def get_group_history(admin_user, now_date):
    results = set()
    now_ts = datetime2ts(now_date)
    start_ts = now_ts - DAY * RECOMMEND_IN_AUTO_DATE
    end_ts = now_ts
    #search group task
    query_body = {
        'query': {
            'bool': {
                'must': [
                    #{'range': {'submit_date': {'gte': start_ts, 'lt': end_ts}}},
                    {'term': {'submit_user': admin_user}},
                    {'term': {'task_type': 'analysis'}}
                ]
            }
        },
        'size': RECOMMEND_IN_AUTO_GROUP
    }
    try:
        group_results = es_group_result.search(index=group_index_name, doc_type=group_index_type,\
                body=query_body, _source=False, fields=['uid_list'])['hits']['hits']
    except:
        group_results = []
    all_user_list = []
    for group_item in group_results:
        try:
            uid_list = group_item['fields']['uid_list']
        except:
            uid_list = []
        all_user_list.extend(uid_list)
    results = set(all_user_list)
    return results
def test_hadoop_job_id(self):
    date = '2013-03-01'
    ts = datetime2ts(date)
    window_size = 1
    topic_id = 1
    job_id = generate_job_id(ts, window_size, topic_id)
    self.assertEqual(job_id, '2013_03_01_1_1', 'wrong job id')
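# The snippets above and below all lean on the same pair of date helpers. As a point
# of reference, here is a minimal sketch of how datetime2ts / ts2datetime are assumed
# to behave (epoch seconds <-> 'YYYY-MM-DD' strings). The real project helpers may
# differ, e.g. in timezone handling, so treat this as an illustration only.
import time

def datetime2ts(date):
    # '2013-03-01' -> epoch seconds at local midnight
    return int(time.mktime(time.strptime(date, '%Y-%m-%d')))

def ts2datetime(ts):
    # epoch seconds -> 'YYYY-MM-DD'
    return time.strftime('%Y-%m-%d', time.localtime(float(ts)))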
def mapper_bci_today(todaydate=None):
    if todaydate:
        BCI_INDEX_NAME = BCI_INDEX_NAME_PRE + ts2datetime(datetime2ts(todaydate) - DAY).replace("-", "")
        TODAY_TIME = todaydate
    else:
        BCI_INDEX_NAME = BCI_INDEX_NAME_PRE + '20130901'
        TODAY_TIME = '2013-09-02'
    s_re = scan(es_9200, query={"query": {"match_all": {}}, "size": MAX_ITEMS, "fields": [TOTAL_NUM, TODAY_BCI]}, index=BCI_INDEX_NAME, doc_type=BCI_INDEX_TYPE)
    count = 0
    array = []
    while 1:
        try:
            temp = s_re.next()
            one_item = {}
            one_item['id'] = temp['_id'].encode("utf-8")
            one_item['total_num'] = temp['fields'][TOTAL_NUM][0]
            one_item['today_bci'] = temp['fields'][TODAY_BCI][0]
            one_item['update_time'] = TODAY_TIME
            array.append(one_item)
            count += 1
            if count % 1000 == 0:
                r_flow.lpush('update_bci_list', json.dumps(array))
                array = []
                count = 0
        except StopIteration:
            print "all done"
            r_flow.lpush('update_bci_list', json.dumps(array))
            break
def get_attr_geo_track(uid_list):
    date_results = []  # results like [['2013-09-01', [(geo1, count1), (geo2, count2)]], ...] for 7 days
    now_ts = time.time()
    now_date = ts2datetime(now_ts)
    #test
    now_date = '2013-09-08'
    ts = datetime2ts(now_date)
    for i in range(7, 0, -1):
        timestamp = ts - i * 24 * 3600
        #print 'timestamp:', ts2datetime(timestamp)
        ip_dict = dict()
        results = r_cluster.hmget('ip_' + str(timestamp), uid_list)
        #print 'results:', results
        for item in results:
            if item:
                item_dict = json.loads(item)
                #print 'item_dict:', item_dict
                for ip_item in item_dict:
                    try:
                        ip_dict[ip_item] += item_dict[ip_item]
                    except:
                        ip_dict[ip_item] = item_dict[ip_item]
        geo_dict = ip2geo(ip_dict)
        sort_geo_dict = sorted(geo_dict.items(), key=lambda x: x[1], reverse=True)
        date_key = ts2datetime(timestamp)
        date_results.append([date_key, sort_geo_dict[:2]])
    #print 'results:', date_results
    return {'geo_track': json.dumps(date_results)}
def scan_retweet(tmp_file):
    count = 0
    ret_count = 0
    scan_cursor = 0
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    retweet_redis = daily_retweet_redis
    start_ts = time.time()
    while True:
        re_scan = retweet_redis.scan(scan_cursor, count=100)
        re_scan_cursor = re_scan[0]
        for item in re_scan[1]:
            count += 1
            item_list = item.split('_')
            if len(item_list) == 2:
                ret_count += 1
                uid = item_list[1]
                item_result = retweet_redis.hgetall(item)
                write_tmp_file(tmp_file, uid, item_result)
        end_ts = time.time()
        #run_type
        #if RUN_TYPE == 0:
        #    print '%s sec scan %s count user:' % (end_ts - start_ts, count)
        if re_scan_cursor == 0:
            break
        scan_cursor = re_scan_cursor
    print 'total %s sec scan %s count user and %s retweet count' % (end_ts - now_ts, count, ret_count)
def save_at(uid, at_uid, timestamp, sensitive):
    date = ts2datetime(timestamp)
    ts = datetime2ts(date)
    key = str(uid)
    ruid_count_dict = dict()
    sensitive_ruid_count_dict = dict()
    ruid_count_string = redis_cluster.hget('at_' + str(ts), str(uid))
    if ruid_count_string:
        ruid_count_dict = json.loads(ruid_count_string)
        if ruid_count_dict.has_key(str(at_uid)):
            ruid_count_dict[str(at_uid)] += 1
        else:
            ruid_count_dict[str(at_uid)] = 1
    else:
        ruid_count_dict[str(at_uid)] = 1
    redis_cluster.hset('at_' + str(ts), str(uid), json.dumps(ruid_count_dict))
    if sensitive:
        sensitive_ruid_count_string = redis_cluster.hget('sensitive_at_' + str(ts), str(uid))
        if sensitive_ruid_count_string:
            sensitive_ruid_count_dict = json.loads(sensitive_ruid_count_string)
            if sensitive_ruid_count_dict.has_key(str(at_uid)):
                sensitive_ruid_count_dict[str(at_uid)] += 1
            else:
                sensitive_ruid_count_dict[str(at_uid)] = 1
        else:
            sensitive_ruid_count_dict[str(at_uid)] = 1
        redis_cluster.hset('sensitive_at_' + str(ts), str(uid), json.dumps(sensitive_ruid_count_dict))
def save_city(uid, ip, timestamp, sensitive):
    date = ts2datetime(timestamp)
    ts = datetime2ts(date)
    key = str(uid)
    ip_count_dict = dict()
    sensitive_ip_count_dict = dict()
    ip_count_string = redis_ip.hget('ip_' + str(ts), str(uid))
    if ip_count_string:
        ip_count_dict = json.loads(ip_count_string)
        if ip_count_dict.has_key(str(ip)):
            ip_count_dict[str(ip)] += 1
        else:
            ip_count_dict[str(ip)] = 1
    else:
        ip_count_dict[str(ip)] = 1
    redis_ip.hset('ip_' + str(ts), str(uid), json.dumps(ip_count_dict))
    if sensitive:
        sensitive_ip_count_string = redis_ip.hget('sensitive_ip_' + str(ts), str(uid))
        if sensitive_ip_count_string:
            sensitive_ip_count_dict = json.loads(sensitive_ip_count_string)
            if sensitive_ip_count_dict.has_key(str(ip)):
                sensitive_ip_count_dict[str(ip)] += 1
            else:
                sensitive_ip_count_dict[str(ip)] = 1
        else:
            sensitive_ip_count_dict[str(ip)] = 1
        redis_ip.hset('sensitive_ip_' + str(ts), str(uid), json.dumps(sensitive_ip_count_dict))
def save_activity(uid, timestamp, time_segment, sensitive):
    date = ts2datetime(timestamp)
    ts = datetime2ts(date)
    key = str(ts)
    activity_count_dict = dict()
    sensitive_activity_count_dict = dict()
    activity_count_string = redis_activity.hget('activity_' + key, str(uid))
    if activity_count_string:
        activity_count_dict = json.loads(activity_count_string)
        if activity_count_dict.has_key(str(time_segment)):
            activity_count_dict[str(time_segment)] += 1
        else:
            activity_count_dict[str(time_segment)] = 1
    else:
        activity_count_dict[str(time_segment)] = 1
    redis_activity.hset('activity_' + key, str(uid), json.dumps(activity_count_dict))
    if sensitive:
        sensitive_activity_count_string = redis_activity.hget('sensitive_activity_' + key, str(uid))
        if sensitive_activity_count_string:
            sensitive_activity_count_dict = json.loads(sensitive_activity_count_string)
            if sensitive_activity_count_dict.has_key(str(time_segment)):
                sensitive_activity_count_dict[str(time_segment)] += 1
            else:
                sensitive_activity_count_dict[str(time_segment)] = 1
        else:
            sensitive_activity_count_dict[str(time_segment)] = 1
        redis_activity.hset('sensitive_activity_' + key, str(uid), json.dumps(sensitive_activity_count_dict))
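# The save_at / save_city / save_activity helpers above all share one storage scheme:
# a Redis hash per day named '<prefix>_<day_ts>', keyed by uid, whose value is a
# JSON-encoded counter dict. A minimal read-side sketch under the same assumptions
# (the function name load_day_counters is hypothetical, for illustration only):
def load_day_counters(redis_conn, prefix, day_ts, uid):
    # e.g. load_day_counters(redis_activity, 'activity', datetime2ts('2013-09-01'), uid)
    raw = redis_conn.hget('%s_%s' % (prefix, day_ts), str(uid))
    return json.loads(raw) if raw else {}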
def get_sensing_history(admin_user, now_date):
    results = set()
    now_ts = datetime2ts(now_date)
    start_ts = now_ts - DAY * RECOMMEND_IN_AUTO_DATE
    end_ts = now_ts
    #search social sensing task
    query_body = {
        'query': {
            'bool': {
                'must': [
                    #{'range': {'create_at': {'gte': start_ts, 'lt': end_ts}}},
                    {'term': {'create_by': admin_user}}
                ]
            }
        },
        'size': RECOMMEND_IN_AUTO_GROUP
    }
    try:
        sensing_result = es_social_sensing.search(index=sensing_index_name, doc_type=sensing_doc_type,\
                body=query_body, _source=False, fields=['social_sensors'])['hits']['hits']
    except:
        sensing_result = []
    sensing_user_list = []
    for task_item in sensing_result:
        user_list = json.loads(task_item['fields']['social_sensors'][0])
        sensing_user_list.extend(user_list)
    results = set(sensing_user_list)
    return results
def uid_lists2fb_from_flow_text(monitor_keywords_list, uid_list):
    nest_query_list = []
    for monitor_keyword in monitor_keywords_list:
        nest_query_list.append({'wildcard': {'keywords_string': '*' + monitor_keyword + '*'}})
    query_body = {
        'query': {
            'bool': {
                'should': nest_query_list,
                'must': [{'terms': {'uid': uid_list}}]
            }
        },
        'size': TOP_WEIBOS_LIMIT,
        'sort': {'timestamp': {'order': 'desc'}}
    }
    if S_TYPE == 'test':
        now_ts = datetime2ts(S_DATE_FB)
    else:
        now_ts = int(time.time())
    datetime = ts2datetime(now_ts - 24 * 3600)
    index_name_flow = facebook_flow_text_index_name_pre + datetime
    es_results = es.search(index=index_name_flow, doc_type=facebook_flow_text_index_type, body=query_body)['hits']['hits']
    results_all = []
    for result in es_results:
        result = result['_source']
        uid = result['uid']
        nick_name, photo_url = fb_uid2nick_name_photo(uid)
        result['nick_name'] = nick_name
        result['photo_url'] = photo_url
        results_all.append(result)
    return results_all
def create_facebook_warning():
    # time setup
    if S_TYPE == 'test':
        test_day_date = FACEBOOK_FLOW_START_DATE
        today_datetime = datetime2ts(test_day_date) - DAY
        start_time = today_datetime
        end_time = today_datetime
        operate_date = ts2datetime(start_time)
    else:
        now_time = int(time.time())
        today_datetime = datetime2ts(ts2datetime(now_time)) - 8 * DAY
        start_time = today_datetime  # 00:00 of the previous day
        end_time = today_datetime  # 00:00 when the scheduled job starts
        operate_date = ts2datetime(start_time)
    account_list = get_user_account_list()
    # account_list = ['*****@*****.**']
    for account in account_list:
        xnr_list = get_user_xnr_list(account)
        # print xnr_list
        #xnr_list = ['FXNR0005']
        for xnr_user_no in xnr_list:
            print 'xnr_user_no:', xnr_user_no
            # personal behavior warning
            personal_mark = create_personal_warning(xnr_user_no, today_datetime)
            # speech content warning
            speech_mark = create_speech_warning(xnr_user_no, today_datetime)
            speech_mark = True
            # emerging event warning
            create_event_warning(xnr_user_no, today_datetime, write_mark=True)
    # date warning
    date_mark = create_date_warning(today_datetime)
    return True
def get_influence(uid_list):
    result = {}
    now_ts = time.time()
    #run_type
    if RUN_TYPE == 1:
        now_date = ts2datetime(now_ts - DAY)
    else:
        now_date = ts2datetime(datetime2ts(RUN_TEST_TIME) - DAY)
    index_time = 'bci_' + ''.join(now_date.split('-'))
    index_type = 'bci'
    try:
        es_result = es.mget(index=index_time, doc_type=index_type, body={'ids': uid_list}, _source=False, fields=['user_index'])['docs']
    except Exception, e:
        raise e
def compute_keywords_mark():
    xnr_user_list = lookup_xnr_user_list()
    # xnr_user_list = ['FXNR0001']
    print 'xnr_user_list:', xnr_user_list
    now_time = int(time.time()) - DAY
    date_time = ts2datetime(now_time)
    mark_list = []
    for xnr_user_no in xnr_user_list:
        keywords_task_detail = dict()
        keyword_value_string = json.dumps(xnr_keywords_compute(xnr_user_no))
        keywords_task_detail['keyword_value_string'] = keyword_value_string
        keywords_task_detail['xnr_user_no'] = xnr_user_no
        #keywords_task_detail['date_time'] = date_time
        #keywords_task_detail['timestamp'] = datetime2ts(date_time)
        if S_TYPE == 'test':
            keywords_task_id = xnr_user_no + '_' + test_date
            keywords_task_detail['timestamp'] = datetime2ts(test_date)
            keywords_task_detail['date_time'] = test_date
            print 'keywords_task_detail:', test_date
        else:
            keywords_task_id = xnr_user_no + '_' + date_time
            keywords_task_detail['timestamp'] = datetime2ts(date_time)
            keywords_task_detail['date_time'] = date_time
            print 'keywords_task_detail:', date_time
        try:
            es_xnr_2.index(index=facebook_keyword_count_index_name, doc_type=facebook_keyword_count_index_type, body=keywords_task_detail, id=keywords_task_id)
            mark = True
        except:
            mark = False
        mark_list.append(mark)
    print 'mark_list:', mark_list
    return mark_list
def create_xnr_targetuser(xnr_user_no):
    # #step1: look up the xnr (virtual persona) list
    # xnr_user_no_list = get_compelete_fbxnr()
    #step2: set the time range
    if S_TYPE == 'test':
        now_time = datetime2ts(FACEBOOK_COMMUNITY_DATE)
    else:
        now_time = int(time.time())
    end_ts = datetime2ts(ts2datetime(now_time))
    start_ts = end_ts - COMMUNITY_TERM * DAY
    datetime_list = []
    if start_ts != end_ts:
        iter_date_ts = end_ts
        while iter_date_ts >= start_ts:
            start_date = ts2datetime(iter_date_ts)
            datetime_list.append(start_date)
            iter_date_ts = iter_date_ts - DAY
    else:
        start_date = ts2datetime(start_ts)
        datetime_list.append(start_date)
    #step3: create seed users for each xnr
    # for xnr_user_no in xnr_user_no_list:
    #step3.1: look up the keywords the xnr has posted
    xnr_keywords = get_xnr_keywords(xnr_user_no, datetime_list)
    #step3.2: look up the xnr's followings or friends
    xnr_relationer = get_xnr_relationer(xnr_user_no)
    #step3.3: expand users based on the keywords and the seed users
    expand_userid_list = get_expand_userid_list(xnr_keywords, xnr_relationer, datetime_list)
    return expand_userid_list
def compute_network_task(network_task_information):
    results = {}
    #step1: get task information
    start_date = network_task_information['start_date']
    start_ts = datetime2ts(start_date)
    end_date = network_task_information['end_date']
    end_ts = datetime2ts(end_date)
    iter_date_ts = start_ts
    to_date_ts = end_ts
    iter_query_date_list = []  # ['2013-09-01', '2013-09-02']
    while iter_date_ts <= to_date_ts:
        iter_date = ts2datetime(iter_date_ts)
        iter_query_date_list.append(iter_date)
        iter_date_ts += DAY
    #step2: iter search flow_text_index_name
    #step2.1: get search keywords list
    query_must_list = []
    keyword_nest_body_list = []
    keywords_string = network_task_information['query_keywords']
    keywords_list = keywords_string.split('&')
    for keywords_item in keywords_list:
        keyword_nest_body_list.append({'wildcard': {'text': '*' + keywords_item + '*'}})
    query_must_list.append({'bool': {'should': keyword_nest_body_list}})
    query_must_list.append({'term': {'message_type': '3'}})
    #step2.2: iter search by date
    results = []
    for iter_date in iter_query_date_list:
        flow_text_index_name = flow_text_index_name_pre + iter_date
        query_body = {'query': {'bool': {'must': query_must_list}}}
        flow_text_result = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,\
                body=query_body)['hits']['hits']
        results.extend(flow_text_result)
    return results
def get_recommend_at_user(xnr_user_no):
    #_id = user_no2_id(user_no)
    es_result = es.get(index=tw_xnr_index_name, doc_type=tw_xnr_index_type, id=xnr_user_no)['_source']
    #print 'es_result:::', es_result
    if es_result:
        uid = es_result['uid']
        daily_interests = es_result['daily_interests']
        if S_TYPE == 'test':
            now_ts = datetime2ts(S_DATE_TW)
        else:
            now_ts = int(time.time())
        datetime = ts2datetime(now_ts - 24 * 3600)
        index_name = twitter_flow_text_index_name_pre + datetime
        nest_query_list = []
        daily_interests_list = daily_interests.split('&')
        es_results_daily = es.search(index=index_name, doc_type=twitter_flow_text_index_type,\
                body={'query': {'match_all': {}}, 'size': 200,\
                      'sort': {'timestamp': {'order': 'desc'}}})['hits']['hits']
        uid_list = []
        if es_results_daily:
            for result in es_results_daily:
                result = result['_source']
                uid_list.append(result['uid'])
        ## look up nick_name from the user index by uid
        uid_nick_name_dict = dict()  # uid never changes, while nick_name may change
        es_results_user = es.mget(index=twitter_user_index_name, doc_type=twitter_user_index_type, body={'ids': uid_list})['docs']
        i = 0
        for result in es_results_user:
            if result['found'] == True:
                result = result['_source']
                uid = result['uid']
                nick_name = result['name']
                if nick_name:
                    i += 1
                    uid_nick_name_dict[uid] = nick_name
                if i >= DAILY_AT_RECOMMEND_USER_TOP:
                    break
        return uid_nick_name_dict
def create_twitter_bci_data(uid, date):
    tw_bci_index_name = tw_bci_index_name_pre + date
    tw_bci_mappings(tw_bci_index_name)
    data = {
        'active': random.choice([1, 1, 1, 1, 1, 2, 3]),
        'propagate': random.choice([1, 1, 1, 1, 1, 1]),
        'cover': random.choice([1, 1, 1, 1, 12, 18, 31, 43, 90, 201]),
        'trust': random.choice([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
        'influence': random.choice([10, 10, 20, 12]),
        'uid': uid,
        'timestamp': datetime2ts(date),
    }
    print es.index(index=tw_bci_index_name, doc_type=tw_bci_index_type, id=uid, body=data)
def save_results_to_es(xnr_user_no, current_date, sort_item, result):
    item_body = {}
    item_body['xnr_user_no'] = xnr_user_no
    item_body['sort_item'] = sort_item
    item_body['result'] = json.dumps(result)
    item_body['timestamp'] = datetime2ts(current_date)
    _id = xnr_user_no + '_' + sort_item
    index_name = active_social_index_name_pre + current_date
    es.index(index=index_name, doc_type=active_social_index_type, body=item_body, id=_id)
def get_activeness(uid, activity_geo):
    result = 0
    now_ts = time.time()
    now_date = ts2datetime(now_ts)
    timestamp = datetime2ts(now_date)
    #test
    #timestamp = datetime2ts('2013-09-08')
    # deal with activity_time fft and statusnum
    activity_list = []
    statusnum = 0
    for i in range(1, 8):
        ts = timestamp - 24 * 3600 * i
        r_result = r_cluster.hget('activity_' + str(ts), uid)
        if r_result:
            r_result = json.loads(r_result)
            #print 'r_result:', r_result
            for i in range(0, 96):
                try:
                    count = r_result[str(i)]
                except:
                    count = 0
                activity_list.append(float(count))
    #print 'activity_list:', activity_list
    statusnum = sum(activity_list)
    signal = np.array(activity_list)
    fftResult = np.abs(np.fft.fft(signal)) ** 2
    n = signal.size
    freq = np.fft.fftfreq(n, d=1)
    i = 0
    max_val = 0
    max_freq = 0
    for val in fftResult:
        #print str(1/freq[i]) + ',' + str(val)
        if val > max_val and freq[i] > 0:
            max_val = val
            max_freq = freq[i]
        i = i + 1
    #print 'i:', i
    #print 'max_freq, max_val:', max_freq, max_val
    # deal with activity_geo input: 'geo&geo'
    activity_geo_count = len(activity_geo.split('&'))
    result = activeness_weight_dict['activity_time'] * math.log(max_freq + 1) + \
             activeness_weight_dict['activity_geo'] * math.log(activity_geo_count + 1) + \
             activeness_weight_dict['statusnum'] * math.log(statusnum + 1)
    #print 'activeness:', result
    return result
def save_city_timestamp(uid, ip, timestamp):
    date = ts2datetime(timestamp)
    ts = datetime2ts(date)
    try:
        ip_timestamp_string = r_cluster.hget('new_ip_' + str(ts), str(uid))
        ip_timestamp_string_dict = json.loads(ip_timestamp_string)
        try:
            add_string = '&' + str(timestamp)
            ip_timestamp_string_dict[str(ip)] += add_string
        except:
            ip_timestamp_string_dict[str(ip)] = str(timestamp)
        r_cluster.hset('new_ip_' + str(ts), str(uid), json.dumps(ip_timestamp_string_dict))
    except:
        r_cluster.hset('new_ip_' + str(ts), str(uid), json.dumps({str(ip): str(timestamp)}))
def save_user_warning(xnr_user_no, start_time, end_time):
    # check whether today's index exists
    today_date = ts2datetime(end_time)
    today_datetime = datetime2ts(today_date)
    weibo_user_warning_index_name = weibo_user_warning_index_name_pre + today_date
    if not es_xnr.indices.exists(index=weibo_user_warning_index_name):
        weibo_user_warning_mappings(weibo_user_warning_index_name)
    new_user_warning = create_personal_warning(xnr_user_no, start_time, end_time)
    today_history_user_warning, old_uid_list = lookup_history_user_warming(xnr_user_no, today_datetime, end_time)
    results = []
    if new_user_warning:
        for item in new_user_warning:
            id_mark = set_intersection(item['uid'], old_uid_list)
            if id_mark == 1:
                # merge with and update the existing doc
                task_id = xnr_user_no + '_' + item['uid']
                old_user = es_xnr.get(index=weibo_user_warning_index_name, doc_type=weibo_user_warning_index_type, id=task_id)['_source']
                old_user['content'] = json.loads(old_user['content'])
                old_user['content'].extend(item['content'])
                old_user['user_sensitive'] = old_user['user_sensitive'] + item['user_sensitive']
                #old_user['user_influence'] = old_user['user_influence'] + item['user_influence']
                try:
                    es_xnr.index(index=weibo_user_warning_index_name, doc_type=weibo_user_warning_index_type, body=old_user, id=task_id)
                    mark = True
                except:
                    mark = False
            else:
                # store directly
                task_id = xnr_user_no + '_' + item['uid']
                try:
                    es_xnr.index(index=weibo_user_warning_index_name, doc_type=weibo_user_warning_index_type, body=item, id=task_id)
                    mark = True
                except:
                    mark = False
            results.append(mark)
    else:
        pass
    print 'person_mark::', results
    return results
def get_hot_recommend_tweets(xnr_user_no, topic_field, sort_item):
    topic_field_en = topic_ch2en_dict[topic_field]
    if sort_item != 'compute_status':
        query_body = {
            'query': {
                'bool': {
                    'must': [
                        {
                            'filtered': {
                                'filter': {
                                    'term': {'topic_field': topic_field_en}
                                }
                            }
                        }
                    ]
                }
            },
            'sort': {sort_item: {'order': 'desc'}},
            'size': TOP_WEIBOS_LIMIT
        }
        current_time = time.time()
        if S_TYPE == 'test':
            current_time = datetime2ts(S_DATE_FB)
        #fb_social_sensing_index_name = fb_social_sensing_index_name_pre + ts2datetime(current_time)
        es_results = es.search(index=fb_social_sensing_index_name, doc_type=fb_social_sensing_index_type, body=query_body)['hits']['hits']
        if not es_results:
            es_results = es.search(index=fb_social_sensing_index_name, doc_type=fb_social_sensing_index_type,\
                    body={'query': {'match_all': {}}, 'size': TOP_WEIBOS_LIMIT,\
                          'sort': {sort_item: {'order': 'desc'}}})['hits']['hits']
        results_all = []
        for result in es_results:
            result = result['_source']
            uid = result['uid']
            nick_name, photo_url = fb_uid2nick_name_photo(uid)
            result['nick_name'] = nick_name
            result['photo_url'] = photo_url
            results_all.append(result)
        return results_all
def save_results(save_type, recomment_results):
    save_mark = False
    #run_type
    if RUN_TYPE == 1:
        now_date = ts2datetime(time.time())
    else:
        now_date = ts2datetime(datetime2ts(RUN_TEST_TIME) - DAY)
    recomment_hash_name = 'recomment_' + now_date + '_auto'
    if save_type == 'hotspot':
        #print 'save hotspot results'
        R_RECOMMENTATION.hset(recomment_hash_name, 'auto', json.dumps(recomment_results))
        save_mark = True
    elif save_type == 'operation':
        #print 'save operation results'
        R_RECOMMENTATION.hmset(recomment_hash_name, recomment_results)
        save_mark = True
    return save_mark
def make_up_user_info(user_list=[]):
    result_info = []
    today = str(datetime.date.today())
    today = '2013-09-07'
    timestamp = datetime2ts(today)
    print len(user_list)
    if user_list:
        for id in user_list:
            item = {}
            item['uid'] = id
            item['is_warehousing'], item['uname'], item['weibo_num'], item['location'], item['fansnum'] = user_portrait_info(id)
            item['bci_day_last'] = history_info(BCIHISTORY_INDEX_NAME, BCIHISTORY_INDEX_TYPE, id, ['bci_day_last'])
            item['sen_day_last'] = history_info(SESHISTORY_INDEX_NAME, BCIHISTORY_INDEX_TYPE, id, ['sensitive_score_' + str(timestamp)])
            result_info.append(item)
        return result_info
    else:
        return []
def compute_history_number(xnr_qq_number):
    query_body_history = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [{
                            "term": {
                                "xnr_qq_number": xnr_qq_number
                            }
                        }]
                    }
                }
            }
        },
        "size": MAX_VALUE,
        # "sort": {"timestamp": {"order": "desc"}}
    }
    enddate = datetime.datetime.now().strftime('%Y-%m-%d')
    startdate = ts2datetime(datetime2ts(enddate) - group_message_windowsize * DAY)
    index_names = get_groupmessage_index_list(startdate, enddate)
    print index_names
    results = {}
    for index_name in index_names:
        # if not es_xnr.indices.exists(index=index_name):
        #     continue
        try:
            # switch the index name to the "sent" table, e.g. sent_group_message_2017-07-07
            result = es.search(index='sent_' + index_name, doc_type=group_message_index_type, body=query_body_history)
            if results != {}:
                results['hits']['hits'].extend(result['hits']['hits'])
            else:
                results = result.copy()
        except:
            pass
    if results != {}:
        history_num = len(results['hits']['hits'])
    else:
        return 0
    return history_num
def inner_group_retweet(item):
    root_uid = str(item['root_uid'])
    uid = str(item['uid'])
    timestamp = item['timestamp']
    date = ts2datetime(timestamp)
    date_ts = datetime2ts(date)
    time_segment = int((timestamp - date_ts) / 900)
    start_ts = date_ts + time_segment * 900
    key = 'inner_' + str(start_ts)
    inner_retweet_exist = monitor_inner_r.hget(root_uid, key)
    if not inner_retweet_exist:
        monitor_inner_r.hset(root_uid, key, json.dumps({uid: 1}))
    else:
        inner_retweet_dict = json.loads(inner_retweet_exist)
        if uid in inner_retweet_dict:
            inner_retweet_dict[uid] += 1
        else:
            inner_retweet_dict[uid] = 1
        monitor_inner_r.hset(root_uid, key, json.dumps(inner_retweet_dict))
def get_all_filed(sort_norm, time):
    ts = datetime2ts(ts2datetime(TIME.time() - DAY))
    field_bci = 'bci_day_last'
    field_weibo = "weibo_month_sum"
    field_sen = 'sensitive_score_%s' % ts
    if sort_norm == "weibo_num":
        if time == 1:
            field_weibo = 'weibo_day_last'
        if time == 7:
            field_weibo = "weibo_week_sum"
        elif time == 30:
            field_weibo = "weibo_month_sum"
        else:
            pass
    if sort_norm == 'bci':
        if time == 1:
            field_bci = 'bci_day_last'
        elif time == 7:
            field_bci = 'bci_week_ave'
        else:
            field_bci = 'bci_month_ave'
    elif sort_norm == 'bci_change':
        if time == 1:
            field_bci = 'bci_day_change'
        elif time == 7:
            field_bci = 'bci_week_change'
        else:
            field_bci = 'bci_month_change'
    elif sort_norm == 'ses':
        if time == 1:
            field_sen = 'sensitive_score_%s' % ts
        elif time == 7:
            field_sen = 'sensitive_week_ave'
        else:
            field_sen = 'senstiive_month_ave'
    elif sort_norm == 'ses_change':
        if time == 1:
            field_sen = 'sensitive_day_change'
        elif time == 7:
            field_sen = 'sensitive_week_change'
        else:
            field_sen = 'sensitive_month_change'
    return field_bci, field_sen, field_weibo
def main():
    uid_list = []
    count = 0
    with open('uid_list_0520.txt', 'rb') as f:
        for item in f:
            uid_list.append(item.strip())
    print "uid_list: ", len(uid_list)
    print uid_list[:3]
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "terms": {
                        "uid": uid_list
                    }
                }
            }
        },
        "size": 100000
    }
    with open('uid_text_0523.txt', 'wb') as f_txt:
        #ts = datetime2ts(ts2datetime(time.time() - 24 * 3600))
        ts = datetime2ts(ts2datetime(time.time()))  # today
        while 1:
            date = ts2datetime(ts)
            index_name = "flow_text_" + str(date)
            print index_name
            exist_bool = es_flow_text.indices.exists(index=index_name)
            if not exist_bool:
                break
            search_results = es_flow_text.search(index=index_name, doc_type="text", body=query_body)["hits"]["hits"]
            print len(search_results)
            if search_results:
                for item in search_results:
                    f_txt.write(json.dumps(item['_source']) + "\n")
                    count += 1
            ts = ts - 24 * 3600
            break
    print count
def get_user_at():
    #step1: get_uid_list
    uid_list = get_uid_list()
    date = ts2datetime(time.time())
    ts = datetime2ts(date)
    f = open('/home/user_portrait_0320/revised_user_portrait/user_portrait/user_portrait/attribute/uid_at.txt', 'w')
    for i in range(1, 8):
        ts = ts - DAY
        for uid in uid_list:
            #try:
            result_string = r_cluster.hget('at_' + str(ts), uid)
            #except:
            #    result_string = ''
            if result_string:
                save_dict = {'ts': ts, 'result': result_string}
                f.write('%s\n' % json.dumps(save_dict))
    f.close()
def mapper_bci_today(todaydate):
    BCI_INDEX_NAME = "bci_" + ts2datetime(datetime2ts(todaydate) - DAY).replace("-", "")
    TODAY_TIME = todaydate
    print BCI_INDEX_NAME
    s_re = scan(es_9200,
                query={
                    "query": {"match_all": {}},
                    "size": MAX_ITEMS,
                    "fields": [TOTAL_NUM, TODAY_BCI, "user_fansnum", 'user_friendsnum']
                },
                index=BCI_INDEX_NAME,
                doc_type=BCI_INDEX_TYPE)
    count = 0
    array = []
    while 1:
        try:
            temp = s_re.next()
            one_item = {}
            one_item['id'] = temp['_id'].encode("utf-8")
            one_item['user_fansnum'] = temp['fields']["user_fansnum"][0]
            one_item['user_friendsnum'] = temp['fields']['user_friendsnum'][0]
            one_item['total_num'] = temp['fields'][TOTAL_NUM][0]
            one_item['today_bci'] = temp['fields'][TODAY_BCI][0]
            one_item['update_time'] = TODAY_TIME
            array.append(one_item)
            count += 1
            if count % 1000 == 0:
                r_flow.lpush('update_bci_list', json.dumps(array))
                array = []
            #if count % 100000 == 0:
            #    print count
        except StopIteration:
            print "all done"
            if array:
                r_flow.lpush('update_bci_list', json.dumps(array))
            break
    print count
def create_fb_warning():
    # time setup
    now_time = int(time.time())
    # start of the current time window
    start_time = datetime2ts(ts2datetime(now_time))
    # create the daily indices
    for i in range(0, 3, 1):
        datetime = start_time - i * DAY
        datename = ts2datetime(datetime)
        facebook_user_warning_mappings(datename)
        facebook_event_warning_mappings(datename)
        facebook_speech_warning_mappings(datename)
        date_result = lookup_date_info(datetime)
        facebook_timing_warning_mappings(date_result)
    account_list = get_user_account_list()
    for account in account_list:
        xnr_list = get_user_xnr_list(account)
        for xnr_user_no in xnr_list:
            for i in range(0, 3, 1):
                task_dict = dict()
                task_dict['xnr_user_no'] = xnr_user_no
                task_dict['today_datetime'] = start_time - i * DAY
                # push the compute tasks onto the warning queues
                r_warning.lpush(fb_user_warning_task_queue_name, json.dumps(task_dict))
                r_warning.lpush(fb_speech_warning_task_queue_name, json.dumps(task_dict))
                r_warning.lpush(fb_event_warning_task_queue_name, json.dumps(task_dict))
    # date warning
    time_task = dict()
    for i in range(0, 3, 1):
        time_task['today_datetime'] = start_time - i * DAY
        r_warning.lpush(fb_time_warning_task_queue_name, json.dumps(time_task))
    return True
def get_tweets_from_bci(monitor_keywords_list, sort_item_new):
    if S_TYPE == 'test':
        now_ts = datetime2ts(S_DATE_BCI_TW)
    else:
        now_ts = int(time.time())
    datetime = ts2datetime(now_ts - 24 * 3600)
    # datetime_new = datetime[0:4] + datetime[5:7] + datetime[8:10]
    datetime_new = datetime
    index_name = tw_bci_index_name_pre + datetime_new
    query_body = {
        'query': {
            'match_all': {}
        },
        'sort': {
            sort_item_new: {
                'order': 'desc'
            }
        },
        'size': BCI_USER_NUMBER
    }
    es_results_bci = es.search(index=index_name, doc_type=tw_bci_index_type, body=query_body)['hits']['hits']
    #print 'es_results_bci::', es_results_bci
    #print 'index_name::', index_name
    uid_set = set()
    if es_results_bci:
        for result in es_results_bci:
            uid = result['_id']
            uid_set.add(uid)
    uid_list = list(uid_set)
    es_results = uid_lists2tw_from_flow_text(monitor_keywords_list, uid_list)
    return es_results
def retweet(xnr_info, date):
    global EXCEPTION
    ts = datetime2ts(date)
    facebook_feedback_retweet_mappings(facebook_feedback_retweet_index_name_pre + date)
    redis_key = 'facebook_feedback_retweet_data'
    xnr_user_no = xnr_info['xnr_user_no']
    lis = load_data(xnr_user_no, redis_key)
    # {'uid', 'nick_name', 'mid', 'timestamp', 'text', 'update_time', 'root_text', 'root_mid'}
    data = []
    for item in lis:
        try:
            uid = item['uid']
            text = item['text']
            if uid in xnr_info['friends_list']:
                facebook_type = u"好友"  # friend
            else:
                facebook_type = u"陌生人"  # stranger
            sensitive_info, sensitive_user = sensitive_func(ts, text, uid)
            d = {
                'uid': uid,
                'text': text,
                'nick_name': item['nick_name'],
                'mid': item['mid'],
                'timestamp': item['timestamp'],
                'update_time': item['update_time'],
                'root_text': item['root_text'],
                'root_mid': item['root_mid'],
                'photo_url': '',
                'root_uid': xnr_info['root_uid'],
                'root_nick_name': xnr_info['root_nick_name'],
                'facebook_type': facebook_type,
                'sensitive_info': sensitive_info,
                'sensitive_user': sensitive_user,
                'retweet': 0,
                'comment': 0,
                'like': 0
            }
            data.append(d)
        except Exception, e:
            EXCEPTION += '\n retweet Exception: ' + str(e)
def create_task_list():
    # 1. search from manage_sensing_task
    # 2. push to redis list-----task_work
    # print start info
    current_path = os.getcwd()
    file_path = os.path.join(current_path, 'task_list.py')
    if S_TYPE == 'test':
        now_ts = datetime2ts(S_DATE) - 3600 * 10
    else:
        now_ts = datehour2ts(ts2datehour(time.time() - 3600))
    print_log = "&".join([file_path, "start", ts2date(now_ts)])
    print print_log
    #ts = ts - 3600
    query_body = {"query": {"match_all": {}}}
    search_results = es.search(index=index_sensing, doc_type=type_sensing, body=query_body)['hits']['hits']
    count = 0
    if search_results:
        for iter_item in search_results:
            _id = iter_item['_id']
            item = iter_item['_source']
            task = []
            task.append(item['task_name'])  # task_name
            try:
                task.append(json.loads(item['social_sensors']))  # social sensors
            except:
                task.append(item['social_sensors'])  # social sensors
            task.append(now_ts)
            #task.append(item['xnr_user_no'])
            #task.append(given_ts)
            r.lpush("task_name", json.dumps(task))
            count += 1
    print count
    print_log = "&".join([file_path, "end", ts2date(time.time())])
    print print_log
def filter_mention(user_set):
    results = []
    now_date = ts2datetime(time.time())
    now_date = '2013-09-08'
    timestamp = datetime2ts(now_date) - 24 * 3600
    for user in user_set:
        mention_set = set()
        for i in range(0, 7):
            ts = timestamp - 3600 * 24 * i
            result = r_cluster.hget('at_' + str(ts), str(user))
            if result:
                item_dict = json.loads(result)
                for at_user in item_dict:
                    mention_set.add(at_user)
        at_count = len(mention_set)
        if at_count < mention_threshold:
            results.append(user)
        else:
            writer.writerow([user, 'mention'])
    print 'after filter mention: ', len(results)
    return results
def get_all_data():
    topic_list = [u'东盟,博览会', u'全军政治工作会议', u'外滩踩踏', u'高校思想宣传', u'APEC', u'张灵甫遗骨疑似被埋羊圈']
    time_range_list = [('2013-09-08', 6), ('2014-11-16', 17), ('2015-01-10', 10), ('2015-02-01', 9), ('2014-11-20', 15), ('2015-02-02', 10)]
    result = {}
    result_list = []
    for i in range(len(topic_list)):
        topic_name = topic_list[i]
        end_date = time_range_list[i][0]
        windowsize = time_range_list[i][1]
        end_ts = datetime2ts(end_date)
        start_ts = end_ts - Day * windowsize
        print 'start compute topic:', topic_name
        result = get_topic_data(topic_name, start_ts, end_ts)
        result_list.append(result)
    return json.dumps(result_list)
def make_up_user_info(user_list=[], isall=False, time=1, sort_norm="bci"):
    result_info = []
    if RUN_TYPE:
        today = str(datetime.date.today())
    else:
        today = '2013-09-07'
    timestamp = datetime2ts(today)
    #print len(user_list)
    if user_list:
        for id in user_list:
            item = {}
            if isall:
                item = all_makeup_info(id, sort_norm, time)
            else:
                item = in_makeup_info(id, sort_norm, time)
            result_info.append(item)
        return result_info
    else:
        return []
def cal_hashtag_work(uid, hashtag_list, timestamp, sensitive):
    date = ts2datetime(timestamp)
    ts = datetime2ts(date)
    key = str(uid)
    hashtag_dict = {}
    sensitive_hashtag_dict = dict()
    for hashtag in hashtag_list:
        try:
            hashtag_dict[hashtag] += 1
        except:
            hashtag_dict[hashtag] = 1
    hashtag_count_string = redis_cluster.hget('hashtag_' + str(ts), str(uid))
    if hashtag_count_string:
        hashtag_count_dict = json.loads(hashtag_count_string)
        for item in hashtag_list:
            if hashtag_count_dict.has_key(item):
                hashtag_count_dict[item] += 1
            else:
                hashtag_count_dict[item] = 1
    else:
        hashtag_count_dict = hashtag_dict
    redis_cluster.hset('hashtag_' + str(ts), str(uid), json.dumps(hashtag_count_dict))
    if sensitive:
        sensitive_hashtag_count_string = redis_cluster.hget('sensitive_hashtag_' + str(ts), str(uid))
        if sensitive_hashtag_count_string:
            sensitive_hashtag_count_dict = json.loads(sensitive_hashtag_count_string)
            for hashtag in hashtag_list:
                if sensitive_hashtag_count_dict.has_key(hashtag):
                    sensitive_hashtag_count_dict[hashtag] += 1
                else:
                    sensitive_hashtag_count_dict[hashtag] = 1
        else:
            sensitive_hashtag_count_dict = hashtag_dict
        redis_cluster.hset('sensitive_hashtag_' + str(ts), str(uid), json.dumps(sensitive_hashtag_count_dict))