def user_sort_interface(username , time ,sort_scope , sort_norm , arg = None, st = None, et = None, isall = False, task_number=0, number=100): task_number = int(task_number) print "user_interface:", number user_list = [] if isall: #deal with the situation of all net user if sort_scope == 'all_limit_keyword': #offline job #add job to es index during = ( datetime2ts(et) - datetime2ts(st) ) / DAY + 1 time = 1 if during > 3: time = 7 elif during > 16: time = 30 running_number = es_user_portrait.count(index='user_rank_keyword_task', doc_type='user_rank_task', body=query_task_number(username))['count'] if running_number > task_number-1: return "more than limit" search_id = add_task( username ,"keyword" , "all" ,'flow_text_' , during , st ,et, arg , sort_norm , sort_scope, time, isall, number) #deal with the offline task return {"flag":True , "search_id" : search_id } elif sort_scope == 'all_nolimit': #online job print "all_sort, ", number user_list = all_sort_filter(None,sort_norm,time,False,number) else: if sort_scope == 'in_limit_keyword': #offline job #deal with the offline task during = ( datetime2ts(et) - datetime2ts(st) ) / DAY + 1 time = 1 if during > 3: time = 7 elif during > 16: time = 30 running_number = es_user_portrait.count(index='user_rank_keyword_task', doc_type='user_rank_task', body=query_task_number(username))['count'] if running_number > task_number-1: return "more than limit" search_id = add_task( username ,"keyword" , "in" ,'flow_text_' , during , st ,et , arg , sort_norm , sort_scope, time, isall, number) return {"flag":True , "search_id" : search_id } elif sort_scope == 'in_limit_hashtag': during = ( datetime2ts(et) - datetime2ts(st) ) / DAY + 1 time = 1 if during > 3: time = 7 elif during > 16: time = 30 running_number = es_user_portrait.count(index='user_rank_keyword_task', doc_type='user_rank_task', body=query_task_number(username))['count'] if running_number > task_number-1: return "more than limit" search_id = add_task( username ,"hashtag" , 
"in" ,'flow_text_' , during , st ,et, arg , sort_norm , sort_scope, time, isall, number) return {"flag":True , "search_id" : search_id } else: #find the scope user_list = in_sort_filter(time , sort_norm,sort_scope , arg,[], False, number) result = make_up_user_info(user_list,isall , time , sort_norm) print "user_list:", len(user_list) return result
def get_user_count():
    """Return the total number of documents in the portrait index."""
    match_all_query = {'query': {'match_all': {}}}
    response = es_user_portrait.count(index=portrait_index_name,
                                      doc_type=portrait_index_type,
                                      body=match_all_query)
    return response['count']
def weibo_sort_interface(username , time, sort_scope, sort_norm, arg, st, et, task_number, number): task_number = int(task_number) print "user_interface:", number weibo_list = [] during = (datetime2ts(et) - datetime2ts(st)) / DAY + 1 time = 1 if during > 3: time = 7 elif during > 16: time = 30 query_body = { "query":{ "terms":{ "status": [0, -1] } } } if sort_scope == 'all_limit_keyword': running_number = es_weibo_portrait.count(index=WEIBO_RANK_KEYWORD_TASK_INDEX, doc_type=WEIBO_RANK_KEYWORD_TASK_TYPE, body=query_body)['count'] if running_number > task_number - 1: return "more than limit" search_id = add_task(username, type="keyword", during=during, st=st, et=et, arg=arg, sort_norm=sort_norm, sort_scope=sort_scope, time=time, number=number) #deal with the offline task return {"flag": True , "search_id": search_id} elif sort_scope == 'all_nolimit': pass return weibo_list
def get_user_count():
    """Return the total number of documents in the user index."""
    match_all_query = {
        'query': {
            'match_all': {}
        }
    }
    response = es.count(index=index_name, doc_type=index_type,
                        body=match_all_query)
    return response['count']
def get_index_rank(index_value, index_name): result = 0 query_body = { 'query':{ 'range':{ index_name:{ 'from':index_value, 'to': 100000 } } } } index_rank = es.count(index='user_portrait', doc_type='user', body=query_body) if index_rank['_shards']['successful'] != 0: result = index_rank['count'] else: print 'es index rank error' results = 0 return result
def get_influence_top_count(top_threshold, user_count):
    """Return the share of user_portrait docs with influence in [top_threshold, 3000)."""
    influence_filter = {
        'range': {
            'influence': {
                'gte': top_threshold,
                'lt': 3000
            }
        }
    }
    query_body = {
        'query': {
            'filtered': {
                'query': {
                    'match_all': {}
                },
                'filter': influence_filter
            }
        }
    }
    top_count = es.count(index='user_portrait', doc_type='user',
                         body=query_body)['count']
    return {'top_influence_ratio': float(top_count) / user_count}
def get_influence_top_count(top_threshold, user_count):
    """Return the share of portrait docs with influence in [top_threshold, 3000)."""
    influence_filter = {
        'range': {
            'influence': {
                'gte': top_threshold,
                'lt': 3000
            }
        }
    }
    query_body = {
        'query': {
            'filtered': {
                'query': {
                    'match_all': {}
                },
                'filter': influence_filter
            }
        }
    }
    top_count = es_user_portrait.count(index=portrait_index_name,
                                       doc_type=portrait_index_type,
                                       body=query_body)['count']
    return {'top_influence_ratio': float(top_count) / user_count}
def weibo_sort_interface(username, time, sort_scope, sort_norm, arg, st, et, task_number, number): task_number = int(task_number) print "user_interface:", number weibo_list = [] during = (datetime2ts(et) - datetime2ts(st)) / DAY + 1 time = 1 if during > 3: time = 7 elif during > 16: time = 30 query_body = {"query": {"terms": {"status": [0, -1]}}} if sort_scope == 'all_limit_keyword': running_number = es_weibo_portrait.count( index=WEIBO_RANK_KEYWORD_TASK_INDEX, doc_type=WEIBO_RANK_KEYWORD_TASK_TYPE, body=query_body)['count'] if running_number > task_number - 1: return "more than limit" search_id = add_task(username, type="keyword", during=during, st=st, et=et, arg=arg, sort_norm=sort_norm, sort_scope=sort_scope, time=time, number=number) #deal with the offline task return {"flag": True, "search_id": search_id} elif sort_scope == 'all_nolimit': pass return weibo_list
def user_sort_interface(username, time, sort_scope, sort_norm, arg=None, st=None, et=None, isall=False, task_number=0, number=100): task_number = int(task_number) print "user_interface:", number user_list = [] if isall: #deal with the situation of all net user if sort_scope == 'all_limit_keyword': #offline job #add job to es index during = (datetime2ts(et) - datetime2ts(st)) / DAY + 1 time = 7 if during > 3: time = 7 elif during > 16: time = 30 running_number = es_user_portrait.count( index='user_rank_keyword_task', doc_type='user_rank_task', body=query_body)['count'] if running_number > task_number - 1: return "more than limit" search_id = add_task(username, "keyword", "all", 'flow_text_', during, st, et, arg, sort_norm, sort_scope, time, isall, number) #deal with the offline task return {"flag": True, "search_id": search_id} elif sort_scope == 'all_nolimit': #online job print "all_sort, ", number user_list = all_sort_filter(None, sort_norm, time, False, number) else: if sort_scope == 'in_limit_keyword': #offline job #deal with the offline task during = (datetime2ts(et) - datetime2ts(st)) / DAY + 1 time = 7 if during > 3: time = 7 elif during > 16: time = 30 running_number = es_user_portrait.count( index='user_rank_keyword_task', doc_type='user_rank_task', body=query_body)['count'] if running_number > task_number - 1: return "more than limit" search_id = add_task(username, "keyword", "in", 'flow_text_', during, st, et, arg, sort_norm, sort_scope, time, isall, number) return {"flag": True, "search_id": search_id} elif sort_scope == 'in_limit_hashtag': during = (datetime2ts(et) - datetime2ts(st)) / DAY + 1 time = 7 if during > 3: time = 7 elif during > 16: time = 30 running_number = es_user_portrait.count( index='user_rank_keyword_task', doc_type='user_rank_task', body=query_body)['count'] if running_number > task_number - 1: return "more than limit" search_id = add_task(username, "hashtag", "in", 'flow_text_', during, st, et, arg, sort_norm, sort_scope, time, 
isall, number) return {"flag": True, "search_id": search_id} else: #find the scope user_list = in_sort_filter(time, sort_norm, sort_scope, arg, [], False, number) result = make_up_user_info(user_list, isall, time, sort_norm) print "user_list:", len(user_list) return result
def get_scan_results_v2(): result_dict = dict() # gender ratio count query_body = { "query": { "match_all": {} }, "aggs": { "all_interests": { "terms": { "field": "gender" } } } } search_results = es_user_portrait.search( index=portrait_index_name, doc_type=portrait_index_type, body=query_body)["aggregations"]['all_interests']['buckets'] gender_result = dict() for item in search_results: gender_result[str(item['key'])] = item['doc_count'] count = sum(gender_result.values()) if count == 0: gender_ratio = {'1': 0.5, '2': 0.5} else: gender_ratio = { '1': float(gender_result['1']) / count, '2': float(gender_result['2']) / count } result_dict['gender_ratio'] = json.dumps(gender_ratio) # verified ratio count query_body = { "query": { "match_all": {} }, "aggs": { "all_interests": { "terms": { "field": "verified" } } } } search_results = es_user_portrait.search( index=portrait_index_name, doc_type=portrait_index_type, body=query_body)["aggregations"]['all_interests']['buckets'] verified_result = dict() for item in search_results: verified_result[item['key']] = item['doc_count'] count = sum(verified_result.values()) if count == 0: verified_ratio = {'yes': 0.5, 'no': 0.5} else: verified_ratio = { 'yes': float(verified_result['']) / count, 'no': float(verified_result['unknown']) / count } result_dict['verified_ratio'] = json.dumps(verified_ratio) # loation top query_body = { "query": { "match_all": {} }, "aggs": { "all_interests": { "terms": { "field": "location", "size": 5 } } } } search_results = es_user_portrait.search( index=portrait_index_name, doc_type=portrait_index_type, body=query_body)["aggregations"]['all_interests']['buckets'] if len(search_results): location_top = [] for item in search_results: location_top.append([item['key'], item['doc_count']]) else: location_top = {} result_dict['location_top'] = json.dumps(location_top) # activity geo query_body = { "query": { "match_all": {} }, "aggs": { "all_interests": { "terms": { "field": "activity_geo_aggs", "size": 50 
} } } } search_results = es_user_portrait.search( index=portrait_index_name, doc_type=portrait_index_type, body=query_body)["aggregations"]['all_interests']['buckets'] if len(search_results): activity_geo_top = [] for item in search_results: activity_geo_top.append([item['key'], item['doc_count']]) else: activity_geo_top = {} activity_geo_top = sorted(activity_geo_top, key=lambda activity_geo_top: activity_geo_top[1], reverse=True) print "activity_geo_top:", activity_geo_top result_dict['activity_geo_top'] = json.dumps(activity_geo_top) # keywords query_body = { "query": { "match_all": {} }, "aggs": { "all_interests": { "terms": { "field": "keywords_string", "size": 50 } } } } search_results = es_user_portrait.search( index=portrait_index_name, doc_type=portrait_index_type, body=query_body)["aggregations"]['all_interests']['buckets'] if len(search_results): keywords_top = [] for item in search_results: keywords_top.append([item['key'], item['doc_count']]) else: keywords_top = {} result_dict['keywords_top'] = json.dumps(keywords_top) # hashtag top query_body = { "query": { "match_all": {} }, "aggs": { "all_interests": { "terms": { "field": "hashtag", "size": 50 } } } } search_results = es_user_portrait.search( index=portrait_index_name, doc_type=portrait_index_type, body=query_body)["aggregations"]['all_interests']['buckets'] if len(search_results): hashtag_top = [] for item in search_results: hashtag_top.append([item['key'], item['doc_count']]) else: hashtag_top = {} hashtag_top = sorted(hashtag_top, key=lambda hashtag_top: hashtag_top[1], reverse=True) result_dict['hashtag_top'] = json.dumps(hashtag_top) # topic top query_body = { "query": { "match_all": {} }, "aggs": { "all_interests": { "terms": { "field": "topic_string", "size": 50 } } } } search_results = es_user_portrait.search( index=portrait_index_name, doc_type=portrait_index_type, body=query_body)["aggregations"]['all_interests']['buckets'] if len(search_results): topic_top = [] for item in 
search_results: topic_top.append([item['key'], item['doc_count']]) else: topic_top = {} result_dict['topic_top'] = json.dumps(topic_top) # domain top query_body = { "query": { "match_all": {} }, "aggs": { "all_interests": { "terms": { "field": "domain", "size": 20 } } } } search_results = es_user_portrait.search( index=portrait_index_name, doc_type=portrait_index_type, body=query_body)["aggregations"]['all_interests']['buckets'] if len(search_results): domain_top = [] for item in search_results: domain_top.append([item['key'], item['doc_count']]) else: domain_top = {} result_dict['domain_top'] = json.dumps(domain_top) result_dict['domain_top_user'] = json.dumps( get_domain_top_user(domain_top)) result_dict['topic_top_user'] = json.dumps(get_topic_top_user(topic_top)) # online pattern top query_body = { "query": { "match_all": {} }, "aggs": { "all_interests": { "terms": { "field": "online_pattern_aggs", "size": 50 } } } } search_results = es_user_portrait.search( index=portrait_index_name, doc_type=portrait_index_type, body=query_body)["aggregations"]['all_interests']['buckets'] if len(search_results): online_pattern_top = [] for item in search_results: online_pattern_top.append([item['key'], item['doc_count']]) else: online_pattern_top = {} result_dict['online_pattern_top'] = json.dumps(online_pattern_top) # activity_count no_activity_count = es_user_portrait.count(index=portrait_index_name, doc_type=portrait_index_type, \ body={'query':{'filtered':{'filter':{'term':{'influence': 0}}}}})['count'] all_count = es_user_portrait.count(index=portrait_index_name, doc_type=portrait_index_type ,\ body={'query':{'match_all':{}}})['count'] result_dict['activity_count'] = 1 - float(no_activity_count) / all_count return result_dict
def get_scan_results(): result_dict = {} gender_result = {'1': 0, '2': 0} verified_result = {'yes': 0, 'no': 0} location_result = {} activity_geo_result = {} keywords_result = {} hashtag_result = {} topic_result = {} online_pattern_result = {} domain_result = {} no_gender_count = 0 no_verified_count = 0 no_location_count = 0 no_activity_geo_count = 0 no_keywords_count = 0 no_hashtag_count = 0 no_topic_count = 0 no_online_pattern_count = 0 no_domain_count = 0 s_re = scan(es_user_portrait, query={ 'query': { 'match_all': {} }, 'size': 100 }, index=portrait_index_name, doc_type=portrait_index_type) print 's_re:', s_re activity_count = 0 while True: portrait_uid_list = [] while True: try: scan_re = s_re.next()['_source'] # gender ratio count portrait_uid_list.append(scan_re['uid']) try: gender_result[str(scan_re['gender'])] += 1 except: no_gender_count += 1 # verified ratio count try: verified_result[str(scan_re['verified'])] += 1 except: no_verified_count += 1 # loation top try: location = scan_re['location'] if len(location.split(' ')) > 1: location = location.split(' ')[0] try: location_result[location] += 1 except: location_result[location] = 1 except: no_location_count += 1 # activity geo try: activity_geo = scan_re['activity_geo_dict'] if activity_geo: activity_geo_dict = json.loads(activity_geo)[-1] for geo in activity_geo_dict: geo_list = geo.split('\t') if geo_list[0] == u'中国' and len(geo_list) >= 2: province = geo_list[1] try: activity_geo_result[ province] += activity_geo_dict[geo] except: activity_geo_result[ province] = activity_geo_dict[geo] except: no_activity_geo_count += 1 # keywords try: keywords = json.loads(scan_re['keywords']) if keywords: for word in keywords: try: keywords_result[word] += keywords[word] except: keywords_result[word] = keywords[word] except: no_keywords_count += 1 # hashtag top try: hashtag_dict = json.loads(scan_re['hashtag_dict']) if hashtag_dict: for tag in hashtag_dict: try: hashtag_result[tag] += hashtag_dict[tag] except: 
hashtag_result[tag] = hashtag_dict[tag] except: no_hashtag_count += 1 # topic top try: topic = scan_re['topic_string'] if topic: topic_list = topic.split('&') for item in topic_list: try: topic_result[item] += 1 except: topic_result[item] = 1 except: no_topic_count += 1 # online pattern top try: online_pattern = json.loads(scan_re['online_pattern']) if online_pattern: for item in online_pattern: try: online_pattern_result[item] += online_pattern[ item] except: online_pattern_result[item] = online_pattern[ item] except: no_online_pattern_count += 1 # domain top try: domain = scan_re['domain'] if domain: try: domain_result[domain] += 1 except: domain_result[domain] = 1 except: no_domain_count += 1 except StopIteration: print 'all done' now_ts = time.time() now_date = ts2datetime(now_ts - DAY) index_time = ''.join(now_date.split('-')) #test index_time = '20130907' # gender ratio count #count = sum(gender_result.values()) all_count = es_user_portrait.count(index=portrait_index_name,doc_type=portrait_index_type,\ body={'query':{'match_all':{}}})['count'] count = all_count print "count:", count gender_ratio = { '1': float(gender_result['1']) / count, '2': float(gender_result['2']) / count } #print 'gender ratio:', gender_ratio activity_result = es_user_portrait.mget( index='bci_' + index_time, doc_type='bci', body={'ids': portrait_uid_list})['docs'] for activity_item in activity_result: if activity_item['found']: activity_count += 1 #print 'activity_count:', activity_count result_dict['activity_count'] = float(activity_count) / count result_dict['gender_ratio'] = json.dumps(gender_ratio) # verified ratio count count = sum(verified_result.values()) if count == 0: verified_ratio = {'yes': 0.5, 'no': 0.5} else: verified_ratio = { 'yes': float(verified_result['yes']) / count, 'no': float(verified_result['no']) / count } #print 'verified ratio:', verified_ratio result_dict['verified_ratio'] = json.dumps(verified_ratio) # location top if location_result: sort_location = 
sorted(location_result.items(), key=lambda x: x[1], reverse=True) location_top = sort_location[:5] else: location_top = {} #print 'location top:', location_top result_dict['location_top'] = json.dumps(location_top) # activity geo top if activity_geo_result: sort_activity_geo = sorted(activity_geo_result.items(), key=lambda x: x[1], reverse=True) activity_geo_top = sort_activity_geo[:50] else: activity_geo_top = {} #print 'activity_geo_top:', activity_geo_top result_dict['activity_geo_top'] = json.dumps(activity_geo_top) # keywords top if keywords_result: sort_keywords = sorted(keywords_result.items(), key=lambda x: x[1], reverse=True) keywords_top = sort_keywords[:50] else: keywords_top = {} #print 'keywords_top:', keywords_top result_dict['keywords_top'] = json.dumps(keywords_top) # hashtag top if hashtag_result: sort_hashtag = sorted(hashtag_result.items(), key=lambda x: x[1], reverse=True) hashtag_top = sort_hashtag[:50] else: hashtag_top = {} #print 'hashtag top:', hashtag_top result_dict['hashtag_top'] = json.dumps(hashtag_top) # topic top if topic_result: sort_topic = sorted(topic_result.items(), key=lambda x: x[1], reverse=True) topic_top = sort_topic[:50] else: topic_top = {} #print 'topic top:', topic_top result_dict['topic_top'] = json.dumps(topic_top) # online_pattern top if online_pattern_result: sort_online_pattern = sorted(online_pattern_result.items(), key=lambda x: x[1], reverse=True) online_pattern_top = sort_online_pattern[:50] else: online_pattern_top = {} #print 'online pattern top:', online_pattern_top result_dict['online_pattern_top'] = json.dumps( online_pattern_top) # domain top if domain_result: sort_domain = sorted(domain_result.items(), key=lambda x: x[1], reverse=True) domain_top = sort_domain[:20] else: domain_top = {} result_dict['domain_top'] = json.dumps(domain_top) result_dict['domain_top_user'] = json.dumps( get_domain_top_user(domain_top)) result_dict['topic_top_user'] = json.dumps( get_topic_top_user(topic_top)) return 
result_dict except Exception, r: print Exception, r return result_dict activity_result = es.mget(index='20130907', doc_type='bci', body={'ids': portrait_uid_list})['docs'] for activity_item in activity_result: if activity_item['found']: activity_count += 1