def compute_group_inner(task_name, task_user, start_ts):
    #step1: get task_user in-monitor task user retweet relation from monitor_inner_r
    #step2: get task_user in-task user retweet relation
    #step3: compute every inner user be-retweeted ratio in task
    #step4: save top5 to es--monitor_result, doc_type=task_name, _id='inner_'+date e.g. 'inner_2013-09-01'
    group_status = 0
    time_segment = 3600 * 24
    iter_time_segment = 900
    iter_ts = start_ts - time_segment
    inner_group_dict = {}
    user_count_dict = {}
    # the span after this print was redacted ('******') in the source; the
    # per-user loop below is a reconstruction assumed from how root_uid and
    # inner_group_dict are used in the rest of the function
    print 'group inner task_user:', task_user
    for root_uid in task_user:
        inner_group_dict[root_uid] = {}
        iter_ts = start_ts - time_segment
        # scan the previous day in 900s (15-minute) windows
        while iter_ts < start_ts:
            '''
            if iter_ts >= start_ts:
                break
            '''
            key = 'inner_' + str(iter_ts)
            print 'iter_ts:', ts2date(iter_ts)
            inner_retweet_string = monitor_inner_r.hget(root_uid, key)
            print 'root_uid, key, inner_retweet_string:', root_uid, key, inner_retweet_string
            if inner_retweet_string:
                print 'yes'
                inner_retweet_dict = json.loads(inner_retweet_string)
            else:
                inner_retweet_dict = None
            if inner_retweet_dict:
                inner_group_dict[root_uid] = merge_dict(inner_group_dict[root_uid], inner_retweet_dict)
            iter_ts += iter_time_segment
        user_inner_retweet_count = sum(inner_group_dict[root_uid].values())
        user_count_dict[root_uid] = user_inner_retweet_count
    all_be_retweet_count = sum(user_count_dict.values())
    if all_be_retweet_count == 0:
        group_status = 1
        return group_status
    sort_user_inner_retweet_count = sorted(user_count_dict.items(), key=lambda x: x[1], reverse=True)
    top5_user = sort_user_inner_retweet_count[:5]
    # date string of the previous day, e.g. '2013-09-01'
    date = ts2datetime(start_ts - 24 * 3600)
    index_body = {'date': date}
    for rank in range(1, 6):
        if rank > len(top5_user):
            break  # guard added: the task may have fewer than 5 users
        key = 'top' + str(rank)
        index_body[key] = json.dumps(top5_user[rank - 1])
    key = 'inner_' + date
    # save inner-retweet graph as dict {root_uid1:{uid1:count1, uid2:count2}, ...}
    index_body['inner_graph'] = json.dumps(inner_group_dict)
    es.index(index=monitor_index_name, doc_type=task_name, id=key, body=index_body)
    group_status = 1
    return group_status
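# A minimal usage sketch for compute_group_inner, assuming monitor_inner_r is a
# redis client whose hash fields 'inner_<ts>' each hold a JSON map of
# {retweeting_uid: count} for one 900s window. The uids, counts, and the
# _demo_* name are hypothetical fixtures, not part of the original module.
def _demo_compute_group_inner():
    import time
    now = int(time.time())
    start_ts = now - now % (3600 * 24)  # today 00:00 UTC
    ts = start_ts - 3600 * 24           # first window of the previous day
    # seed one inner-retweet window for a hypothetical task user '10001'
    monitor_inner_r.hset('10001', 'inner_' + str(ts), json.dumps({'10002': 3, '10003': 1}))
    status = compute_group_inner('test_task', ['10001', '10002', '10003'], start_ts)
    print 'compute_group_inner status:', status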
def save_mid_result_one(task_name, sensitive_weibo_dict, geo_weibo_dict, sentiment_weibo_dict, hashtag_weibo_dict, sensitive_word_dict, start_ts):
    status = 0
    insert_body = {}
    insert_body['count'] = json.dumps(sensitive_weibo_dict)
    insert_body['geo'] = json.dumps(geo_weibo_dict)
    insert_body['sentiment'] = json.dumps(sentiment_weibo_dict)
    if hashtag_weibo_dict != {}:
        insert_body['hashtag'] = json.dumps(hashtag_weibo_dict)
    if sensitive_word_dict != {}:
        insert_body['sensitive_word'] = json.dumps(sensitive_word_dict)
    insert_body['timestamp'] = start_ts  # mark the ts
    es.index(index=monitor_index_name, doc_type=task_name, id=start_ts, body=insert_body)
    status = 1
    return status
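# A hedged example of calling save_mid_result_one. The dict shapes below are
# assumptions inferred from the field names (count/geo/sentiment look like
# {bucket: weibo_count} maps), not a documented schema; _demo_* is hypothetical.
def _demo_save_mid_result_one():
    import time
    start_ts = int(time.time())
    status = save_mid_result_one(
        'test_task',
        {'sensitive': 12, 'total': 340},    # sensitive_weibo_dict
        {'beijing': 5, 'shanghai': 3},      # geo_weibo_dict
        {'positive': 100, 'negative': 20},  # sentiment_weibo_dict
        {},                                 # hashtag_weibo_dict (empty dicts are skipped)
        {},                                 # sensitive_word_dict
        start_ts)
    print 'save_mid_result_one status:', status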
def save_mid_result_group(task_name, sensitive_weibo_dict, geo_weibo_dict, sentiment_weibo_dict, hashtag_weibo_dict, sensitive_word_dict, start_ts):
    status = 0
    print 'start save result to es'
    insert_body = {}
    insert_body['count'] = json.dumps(sensitive_weibo_dict)
    insert_body['geo'] = json.dumps(geo_weibo_dict)
    insert_body['sentiment'] = json.dumps(sentiment_weibo_dict)
    if hashtag_weibo_dict != {}:
        insert_body['hashtag'] = json.dumps(hashtag_weibo_dict)
    if sensitive_word_dict != {}:
        insert_body['sensitive_word'] = json.dumps(sensitive_word_dict)
    insert_body['timestamp'] = start_ts
    # other attributes about the monitor group should be added here
    es.index(index=monitor_index_name, doc_type=task_name, id=start_ts, body=insert_body)
    status = 1
    print 'end save result'
    return status
def init_custom_attribute():
    index_info = {
        'settings': {
            'analysis': {
                'analyzer': {
                    # tokenize on '&' so multi-valued fields like 'tag1&tag2' split into terms
                    'my_analyzer': {
                        'type': 'pattern',
                        'pattern': '&'
                    }
                }
            }
        },
        'mappings': {
            'attribute': {
                'properties': {
                    'attribute_name': {
                        'type': 'string',
                        'index': 'not_analyzed'
                    },
                    'attribute_value': {
                        'type': 'string',
                        'analyzer': 'my_analyzer'
                    },
                    'date': {
                        'type': 'string',
                        'index': 'not_analyzed'
                    },
                    'user': {
                        'type': 'string',
                        'index': 'not_analyzed'
                    }
                }
            }
        }
    }
    # recreate the index from scratch
    flag = es.indices.exists(index=attribute_index_name)
    if flag:
        es.indices.delete(index=attribute_index_name)
    es.indices.create(index=attribute_index_name, body=index_info, ignore=400)
    # test document; the 'user' value was redacted in the source
    es.index(index=attribute_index_name, doc_type=attribute_index_type, id='test_tag',
             body={'attribute_name': 'test_tag', 'attribute_value': 'tag1&tag2',
                   'date': '2013-09-08', 'user': '******'})
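# A hedged sketch of what the pattern analyzer buys us: because '&' is the
# token separator, a match query on attribute_value should hit the
# 'tag1&tag2' test document on either tag. Index and field names follow the
# mapping above; the _demo_* name is hypothetical.
def _demo_query_attribute():
    result = es.search(index=attribute_index_name, doc_type=attribute_index_type,
                       body={'query': {'match': {'attribute_value': 'tag2'}}})
    print 'hits for tag2:', result['hits']['total']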
def main():
    index_info = {
        'settings': {
            'analysis': {
                'analyzer': {
                    'my_analyzer': {
                        'type': 'pattern',
                        'pattern': '&'
                    }
                }
            }
        },
        'mappings': {
            'text': {
                'properties': {
                    'text': {'type': 'string', 'index': 'not_analyzed'},
                    'mid': {'type': 'string', 'index': 'not_analyzed'},
                    'ip': {'type': 'string', 'index': 'not_analyzed'},
                    'timestamp': {'type': 'long'},
                    'sentiment': {'type': 'string', 'index': 'not_analyzed'},
                    'geo': {'type': 'string', 'index': 'not_analyzed'},
                    'message_type': {'type': 'string', 'index': 'not_analyzed'},
                    'uid': {'type': 'string', 'index': 'not_analyzed'},
                    'hashtag': {'type': 'string', 'analyzer': 'my_analyzer'},
                    'sensitive_word': {'type': 'string', 'analyzer': 'my_analyzer'}
                }
            }
        }
    }
    es.indices.create(index='monitor_user_text', body=index_info, ignore=400)
    es.index(index='monitor_user_text', doc_type='text', id='test', body={'uid': 'test'})
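# Assumed entry point: this section of the original file carried no __main__
# guard, so the line below is a conventional addition, not source behavior.
if __name__ == '__main__':
    main()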