def user_sort_interface(username, time, sort_scope, sort_norm, arg=None, st=None, et=None, isall=False, task_number=0, number=100):
    """Rank users online, or queue an offline keyword/hashtag ranking task.

    Offline scopes ('all_limit_keyword' when ``isall`` is true,
    'in_limit_keyword' / 'in_limit_hashtag' otherwise) do not compute a
    ranking here; they register a task through ``add_task`` and return its id.
    Every other scope is ranked immediately.

    Args:
        username: forwarded to ``query_task_number`` and ``add_task``.
        time: ranking window passed to the sort filters; for offline scopes
            it is recomputed from the ``st``..``et`` span.
        sort_scope: selects the branch (see above).
        sort_norm: forwarded to the sort filters / ``add_task``.
        arg: extra argument forwarded to ``add_task`` or ``in_sort_filter``.
        st, et: start / end of the offline range, converted with
            ``datetime2ts``.
        isall: true for the "all" branch, false for the "in" branch.
        task_number: maximum number of tasks counted by
            ``query_task_number(username)`` before a new one is refused.
        number: result-size argument forwarded to the filters / ``add_task``.

    Returns:
        The string "more than limit" when the task limit is reached, a dict
        ``{"flag": True, "search_id": ...}`` when an offline task was queued,
        otherwise the value returned by ``make_up_user_info``.
    """
    task_number = int(task_number)
    # Single-argument print(...) emits the same text as the Python 2 print
    # statement it replaces.
    print("user_interface: %s" % (number,))

    # sort_scope -> (search type, range tag) for the scopes handled offline.
    if isall:
        offline_jobs = {'all_limit_keyword': ("keyword", "all")}
    else:
        offline_jobs = {
            'in_limit_keyword': ("keyword", "in"),
            'in_limit_hashtag': ("hashtag", "in"),
        }

    job = offline_jobs.get(sort_scope)
    if job is not None:
        search_type, range_tag = job
        during = (datetime2ts(et) - datetime2ts(st)) / DAY + 1
        # The widest window must be tested first: the original tested
        # "during > 3" before "during > 16", so 30 could never be selected.
        if during > 16:
            time = 30
        elif during > 3:
            time = 7
        else:
            time = 1
        running_number = es_user_portrait.count(
            index='user_rank_keyword_task',
            doc_type='user_rank_task',
            body=query_task_number(username))['count']
        if running_number >= task_number:
            return "more than limit"
        search_id = add_task(username, search_type, range_tag, 'flow_text_',
                             during, st, et, arg, sort_norm, sort_scope,
                             time, isall, number)
        return {"flag": True, "search_id": search_id}

    # Online ranking.
    user_list = []
    if isall:
        # Any other "all" scope falls through with an empty user list.
        if sort_scope == 'all_nolimit':
            print("all_sort,  %s" % (number,))
            user_list = all_sort_filter(None, sort_norm, time, False, number)
    else:
        user_list = in_sort_filter(time, sort_norm, sort_scope, arg, [], False, number)

    result = make_up_user_info(user_list, isall, time, sort_norm)
    print("user_list: %s" % len(user_list))
    return result
def user_sort_interface(username, time, sort_scope, sort_norm, arg=None, st=None, et=None, isall=False):
    """Rank users online, or queue an offline keyword/hashtag ranking task.

    Offline scopes ('all_limit_keyword' when ``isall`` is true,
    'in_limit_keyword' / 'in_limit_hashtag' otherwise) register a task through
    ``add_task`` and return its id. Every other scope is ranked immediately.

    Args:
        username: forwarded to ``add_task``.
        time: ranking window passed to the sort filters; for offline scopes
            it is recomputed from the ``st``..``et`` span.
        sort_scope: selects the branch (see above).
        sort_norm: forwarded to the sort filters / ``add_task``.
        arg: extra argument forwarded to ``add_task`` or ``in_sort_filter``.
        st, et: start / end of the offline range, converted with
            ``datetime2ts``.
        isall: true for the "all" branch, false for the "in" branch.

    Returns:
        A dict ``{"flag": True, "search_id": ...}`` when an offline task was
        queued, otherwise the value returned by ``make_up_user_info``.
    """
    # sort_scope -> (search type, range tag) for the scopes handled offline.
    if isall:
        offline_jobs = {'all_limit_keyword': ("keyword", "all")}
    else:
        offline_jobs = {
            'in_limit_keyword': ("keyword", "in"),
            'in_limit_hashtag': ("hashtag", "in"),
        }

    job = offline_jobs.get(sort_scope)
    if job is not None:
        search_type, range_tag = job
        during = (datetime2ts(et) - datetime2ts(st)) / DAY + 1
        # The widest window must be tested first: the original tested
        # "during > 3" before "during > 16", so 30 could never be selected.
        if during > 16:
            time = 30
        elif during > 3:
            time = 7
        else:
            time = 1
        search_id = add_task(username, search_type, range_tag, 'flow_text_',
                             during, st, et, arg, sort_norm, sort_scope,
                             time, isall)
        return {"flag": True, "search_id": search_id}

    # Online ranking.
    user_list = []
    if isall:
        # Any other "all" scope falls through with an empty user list.
        if sort_scope == 'all_nolimit':
            user_list = all_sort_filter(None, sort_norm, time)
    else:
        user_list = in_sort_filter(time, sort_norm, sort_scope, arg)

    result = make_up_user_info(user_list, isall, time, sort_norm)
    return result
def user_sort_interface(username, time, sort_scope, sort_norm, arg=None, st=None, et=None, isall=False, task_number=0, number=100):
    """Rank users online, or queue an offline keyword/hashtag ranking task.

    Offline scopes ('all_limit_keyword' when ``isall`` is true,
    'in_limit_keyword' / 'in_limit_hashtag' otherwise) register a task through
    ``add_task`` and return its id. Every other scope is ranked immediately.

    Args:
        username: forwarded to ``add_task``.
        time: ranking window passed to the sort filters; for offline scopes
            it is recomputed from the ``st``..``et`` span.
        sort_scope: selects the branch (see above).
        sort_norm: forwarded to the sort filters / ``add_task``.
        arg: extra argument forwarded to ``add_task`` or ``in_sort_filter``.
        st, et: start / end of the offline range, converted with
            ``datetime2ts``.
        isall: true for the "all" branch, false for the "in" branch.
        task_number: maximum number of counted tasks before a new one is
            refused.
        number: result-size argument forwarded to the filters / ``add_task``.

    Returns:
        The string "more than limit" when the task limit is reached, a dict
        ``{"flag": True, "search_id": ...}`` when an offline task was queued,
        otherwise the value returned by ``make_up_user_info``.
    """
    task_number = int(task_number)
    # Single-argument print(...) emits the same text as the Python 2 print
    # statement it replaces.
    print("user_interface: %s" % (number,))

    # sort_scope -> (search type, range tag) for the scopes handled offline.
    if isall:
        offline_jobs = {'all_limit_keyword': ("keyword", "all")}
    else:
        offline_jobs = {
            'in_limit_keyword': ("keyword", "in"),
            'in_limit_hashtag': ("hashtag", "in"),
        }

    job = offline_jobs.get(sort_scope)
    if job is not None:
        search_type, range_tag = job
        during = (datetime2ts(et) - datetime2ts(st)) / DAY + 1
        # Default window is 7; the original's "elif during > 16" sat behind
        # "if during > 3" and so could never select 30.
        time = 7
        if during > 16:
            time = 30
        # NOTE(review): query_body is not defined in this function; it is
        # presumably a module-level name -- confirm, or build the query from
        # username as other revisions of this function do.
        running_number = es_user_portrait.count(
            index='user_rank_keyword_task',
            doc_type='user_rank_task',
            body=query_body)['count']
        if running_number >= task_number:
            return "more than limit"
        search_id = add_task(username, search_type, range_tag, 'flow_text_',
                             during, st, et, arg, sort_norm, sort_scope,
                             time, isall, number)
        return {"flag": True, "search_id": search_id}

    # Online ranking.
    user_list = []
    if isall:
        # Any other "all" scope falls through with an empty user list.
        if sort_scope == 'all_nolimit':
            print("all_sort,  %s" % (number,))
            user_list = all_sort_filter(None, sort_norm, time, False, number)
    else:
        user_list = in_sort_filter(time, sort_norm, sort_scope, arg, [], False, number)

    result = make_up_user_info(user_list, isall, time, sort_norm)
    print("user_list: %s" % len(user_list))
    return result
def key_words_search(task_id, search_type, pre, during, start_time, keyword_list, search_key='', sort_norm='', sort_scope='', time=1, isall=False, number=100):
    """Run an offline keyword/hashtag search and store the ranking on the task.

    Searches the daily text indices for posts matching every keyword, keeps
    the first hit of each distinct uid, ranks those users with
    ``in_sort_filter`` (``isall`` false) or ``all_sort_filter`` (``isall``
    true), and writes the ranking plus one text record per user back to the
    task document identified by ``task_id``.

    Args:
        task_id: id of the task document to update.
        search_type: "hashtag" builds prefix clauses on "#key#"; anything
            else builds wildcard clauses on "*key*".
        pre: index-name prefix; one index per day is ``pre + ts2datetime(ts)``.
        during: number of consecutive daily indices to look at.
        start_time: timestamp of the first day (advanced by ``DAY`` per step).
        keyword_list: iterable of keyword strings.
        search_key: unused in this function.
        sort_norm, sort_scope, time: forwarded to the sort filters and
            ``make_up_user_info``.
        isall: false restricts to uids found in the user-portrait index.
        number: maximum number of users / text records kept.

    Returns:
        The string "1" after the task document has been re-indexed.
    """
    number = int(number)

    # One clause per keyword; they all go under "must", so a post has to
    # match every keyword.
    keyword_clauses = []
    for key in keyword_list:
        if search_type == "hashtag":
            keyword_clauses.append({"prefix": {"text": "#" + key + "#"}})
        else:
            keyword_clauses.append({"wildcard": {"text": "*" + key + "*"}})

    # Collect the daily indices that actually exist. "> 0" (instead of plain
    # truthiness) stops the loop for negative or fractional day counts.
    index_list = []
    day_ts = start_time
    remaining_days = during
    while remaining_days > 0:
        index_name = pre + ts2datetime(day_ts)
        if es_flow_text.indices.exists(index=index_name):
            index_list.append(index_name)
        day_ts = day_ts + DAY
        remaining_days -= 1
    # Single-argument print(...) emits the same text as the Python 2 print
    # statement it replaces.
    print(index_list)
    # NOTE(review): index_list may be empty here; confirm how the ES client
    # treats an empty index list before relying on the search below.

    query_body = {
        "query": {
            "bool": {
                "must": keyword_clauses
            }
        },
        "sort": {
            "user_fansnum": {
                "order": "desc"
            }
        },
        "size": 5000
    }
    results = es_flow_text.search(
        index=index_list,
        doc_type='text',
        body=query_body,
        _source=False,
        fields=[
            "uid", "user_fansnum", "text", "message_type", "sentiment",
            "timestamp", "geo", "retweeted", "comment"
        ])["hits"]["hits"]

    # Keep the first hit of every uid; first_hit_pos[k] is the position in
    # `results` of the hit that belongs to un_uid_list[k].
    seen_uids = set()
    un_uid_list = []
    first_hit_pos = []
    for position, hit in enumerate(results):
        uid = hit['fields']['uid'][0]
        if uid not in seen_uids:
            seen_uids.add(uid)
            un_uid_list.append(uid)
            first_hit_pos.append(position)

    def build_record(hit, nick_name):
        # One text record: the hit's fields plus display name and weibo URL.
        fields = hit['fields']
        uid = fields['uid'][0]
        return [
            uid,
            fields['user_fansnum'][0],
            fields['text'][0],
            fields['message_type'][0],
            fields['sentiment'][0],
            ts2date(fields['timestamp'][0]),
            fields['geo'][0],
            fields['retweeted'][0],
            fields['comment'][0],
            nick_name,
            weiboinfo2url(uid, hit['_id']),
        ]

    uid_list = []
    text_results = []
    print("un_uid_list:  %s" % len(un_uid_list))
    portrait_list = []
    if not isall and un_uid_list:
        # In-library: keep only the uids found in the user-portrait index.
        portrait_results = es_user_portrait.mget(
            index=USER_INDEX_NAME,
            doc_type=USER_INDEX_TYPE,
            body={"ids": un_uid_list},
            _source=False,
            fields=['uname'])["docs"]
        count = 0
        # The docs are consumed positionally against un_uid_list, so the
        # position must advance for every doc, found or not.
        for in_index, doc in enumerate(portrait_results):
            if not doc["found"]:
                continue
            portrait_list.append(doc['_id'])
            nick_name = doc['fields']['uname'][0]
            if nick_name == 'unknown':
                nick_name = doc['_id']
            # append (not extend): one list per user, the same shape the
            # other branch produces.
            text_results.append(
                build_record(results[first_hit_pos[in_index]], nick_name))
            count += 1
            if count == number:
                break
            print("portrait_len,  %s" % len(portrait_list))
        if portrait_list:
            uid_list = in_sort_filter(time, sort_norm, sort_scope, None,
                                      portrait_list, True, number)  # sort
    elif un_uid_list:
        profile_result = es_user_profile.mget(
            index="weibo_user",
            doc_type="user",
            body={"ids": un_uid_list},
            fields=['nick_name'])["docs"]
        # Slice to `number` so the records cover exactly the uids ranked
        # below; the original "if i == number: break" ran after the append
        # and kept one record too many.
        for i, profile in enumerate(profile_result[:number]):
            try:
                nick_name = profile['fields']['nick_name'][0]
            except (KeyError, IndexError, TypeError):
                # No usable nick name in the profile doc: fall back to the uid.
                nick_name = un_uid_list[i]
            text_results.append(
                build_record(results[first_hit_pos[i]], nick_name))
        uid_list = all_sort_filter(un_uid_list[:number], sort_norm, time,
                                   True, number)

    print("filter_uid_list:  %s" % len(uid_list))
    if uid_list:
        results = make_up_user_info(uid_list, isall, time, sort_norm)
    else:
        results = []
    print("results:  %s" % len(results))

    # Update the task status and store the results on the task document.
    task_detail = es_user_portrait.get(index=USER_RANK_KEYWORD_TASK_INDEX,
                                       doc_type=USER_RANK_KEYWORD_TASK_TYPE,
                                       id=task_id)
    item = task_detail['_source']
    item['status'] = 1
    item['result'] = json.dumps(results)
    item['text_results'] = json.dumps(text_results)
    item['number'] = len(results)
    es_user_portrait.index(index=USER_RANK_KEYWORD_TASK_INDEX,
                           doc_type=USER_RANK_KEYWORD_TASK_TYPE,
                           id=task_id,
                           body=item)
    return "1"
def key_words_search(task_id, search_type, pre, during, start_time, keyword_list, search_key='', sort_norm='', sort_scope='', time=7, isall=False, number=100):
    """Run an offline keyword/hashtag search and store the ranking on the task.

    Searches the daily text indices for posts matching every keyword, keeps
    the first hit of each distinct uid, ranks those users with
    ``in_sort_filter`` (``isall`` false) or ``all_sort_filter`` (``isall``
    true), and writes the ranking plus the text records, reordered to follow
    the ranking, back to the task document identified by ``task_id``.

    Args:
        task_id: id of the task document to update.
        search_type: "hashtag" builds prefix clauses on "#key#"; anything
            else builds wildcard clauses on "*key*".
        pre: index-name prefix; one index per day is ``pre + ts2datetime(ts)``.
        during: number of consecutive daily indices to look at.
        start_time: timestamp of the first day (advanced by ``DAY`` per step).
        keyword_list: iterable of keyword strings.
        search_key: unused in this function.
        sort_norm, sort_scope, time: forwarded to the sort filters and
            ``make_up_user_info``.
        isall: false restricts to uids found in the user-portrait index.
        number: maximum number of users / text records kept.

    Returns:
        The string "1" after the task document has been re-indexed.
    """
    number = int(number)

    # One clause per keyword; they all go under "must", so a post has to
    # match every keyword.
    keyword_clauses = []
    for key in keyword_list:
        if search_type == "hashtag":
            keyword_clauses.append({"prefix": {"text": "#" + key + "#"}})
        else:
            keyword_clauses.append({"wildcard": {"text": "*" + key + "*"}})

    # Collect the daily indices that actually exist. "> 0" (instead of plain
    # truthiness) stops the loop for negative or fractional day counts.
    index_list = []
    day_ts = start_time
    remaining_days = during
    while remaining_days > 0:
        index_name = pre + ts2datetime(day_ts)
        if es_flow_text.indices.exists(index=index_name):
            index_list.append(index_name)
        day_ts = day_ts + DAY
        remaining_days -= 1
    # Single-argument print(...) emits the same text as the Python 2 print
    # statement it replaces.
    print(index_list)
    # NOTE(review): index_list may be empty here; confirm how the ES client
    # treats an empty index list before relying on the search below.

    query_body = {
        "query": {
            "bool": {
                "must": keyword_clauses
            }
        },
        "sort": {"user_fansnum": {"order": "desc"}},
        "size": 5000
    }
    results = es_flow_text.search(
        index=index_list,
        doc_type='text',
        body=query_body,
        _source=False,
        fields=[
            "uid", "user_fansnum", "text", "message_type", "sentiment",
            "timestamp", "geo", "retweeted", "comment"
        ])["hits"]["hits"]

    # Keep the first hit of every uid; first_hit_pos[k] is the position in
    # `results` of the hit that belongs to un_uid_list[k].
    seen_uids = set()
    un_uid_list = []
    first_hit_pos = []
    for position, hit in enumerate(results):
        uid = hit['fields']['uid'][0]
        if uid not in seen_uids:
            seen_uids.add(uid)
            un_uid_list.append(uid)
            first_hit_pos.append(position)

    def build_record(hit, nick_name):
        # One text record: the hit's fields plus display name and weibo URL.
        fields = hit['fields']
        uid = fields['uid'][0]
        return [
            uid,
            fields['user_fansnum'][0],
            fields['text'][0],
            fields['message_type'][0],
            fields['sentiment'][0],
            ts2date(fields['timestamp'][0]),
            fields['geo'][0],
            fields['retweeted'][0],
            fields['comment'][0],
            nick_name,
            weiboinfo2url(uid, hit['_id']),
        ]

    uid_list = []
    text_results = []
    sorted_text_results = []
    print("un_uid_list:  %s" % len(un_uid_list))
    portrait_list = []
    if not isall and un_uid_list:
        # In-library: keep only the uids found in the user-portrait index.
        portrait_results = es_user_portrait.mget(
            index=USER_INDEX_NAME,
            doc_type=USER_INDEX_TYPE,
            body={"ids": un_uid_list},
            _source=False,
            fields=['uname'])["docs"]
        count = 0
        # The docs are consumed positionally against un_uid_list, so the
        # position must advance for every doc, found or not.
        for in_index, doc in enumerate(portrait_results):
            if not doc["found"]:
                continue
            portrait_list.append(doc['_id'])
            nick_name = doc['fields']['uname'][0]
            if nick_name == 'unknown':
                nick_name = doc['_id']
            # append (not extend): text_results[k] must be the whole record
            # of portrait_list[k] for the reordering below to work.
            text_results.append(
                build_record(results[first_hit_pos[in_index]], nick_name))
            count += 1
            if count == number:
                break
            print("portrait_len,  %s" % len(portrait_list))
        if portrait_list:
            uid_list = in_sort_filter(time, sort_norm, sort_scope, None,
                                      portrait_list, True, number)  # sort
            for iter_uid in uid_list:
                iter_index = portrait_list.index(iter_uid)
                # The original indexed with `i`, which is not defined in
                # this branch; the looked-up position is what was intended.
                sorted_text_results.append(text_results[iter_index])
    elif un_uid_list:
        profile_result = es_user_profile.mget(
            index="weibo_user",
            doc_type="user",
            body={"ids": un_uid_list},
            fields=['nick_name'])["docs"]
        # Slice to `number` so the records cover exactly the uids ranked
        # below; the original "if i == number: break" ran after the append
        # and kept one record too many.
        for i, profile in enumerate(profile_result[:number]):
            try:
                nick_name = profile['fields']['nick_name'][0]
            except (KeyError, IndexError, TypeError):
                # No usable nick name in the profile doc: fall back to the uid.
                nick_name = un_uid_list[i]
            text_results.append(
                build_record(results[first_hit_pos[i]], nick_name))
        uid_list = all_sort_filter(un_uid_list[:number], sort_norm, time,
                                   True, number)
        sorted_text_results = []
        # NOTE(review): small.txt looks like leftover debug output -- confirm
        # whether it is still needed. `with` closes it even if a lookup fails.
        with open("small.txt", "wb") as f:
            for iter_uid in uid_list:
                iter_index = un_uid_list.index(iter_uid)
                f.write(str(iter_uid) + "\n")
                sorted_text_results.append(text_results[iter_index])

    print("filter_uid_list:  %s" % len(uid_list))
    if uid_list:
        results = make_up_user_info(uid_list, isall, time, sort_norm)
    else:
        results = []
    print("results:  %s" % len(results))

    # Update the task status and store the results on the task document.
    task_detail = es_user_portrait.get(index=USER_RANK_KEYWORD_TASK_INDEX,
                                       doc_type=USER_RANK_KEYWORD_TASK_TYPE,
                                       id=task_id)
    item = task_detail['_source']
    item['status'] = 1
    item['result'] = json.dumps(results)
    item['text_results'] = json.dumps(sorted_text_results)
    item['number'] = len(results)
    es_user_portrait.index(index=USER_RANK_KEYWORD_TASK_INDEX,
                           doc_type=USER_RANK_KEYWORD_TASK_TYPE,
                           id=task_id,
                           body=item)
    return "1"
for item in result : in_set.add(item['_id'].encode("utf-8") ) result_list = list(uid_set & in_set) except Exception,e: print e raise Exception('user_list failed!') else : result_list = list(uid_set) if not isall: uid_list = in_sort_filter(time,sort_norm ,sort_scope ,None , result_list , True) else: uid_list = all_sort_filter(result_list , sort_norm , time ,True) results = make_up_user_info(uid_list,isall,time,sort_norm) query = {"query":{"bool":{"must":[{"term":{"user_rank_task.user_ts":search_key}}],"must_not":[],"should":[]}},"from":0,"size":10,"sort":[],"facets":{}} if True: result = es_9200.search(index = USER_RANK_KEYWORD_TASK_INDEX , doc_type = USER_RANK_KEYWORD_TASK_TYPE , body = query)['hits']['hits'] search_id = result[0]['_id'] item = result[0]['_source'] item['status'] = 1 item['result'] = json.dumps(results) es_9200.index(index = USER_RANK_KEYWORD_TASK_INDEX , doc_type=USER_RANK_KEYWORD_TASK_TYPE , id=search_id, body=item) return results def scan_offlice_task():
for item in result: in_set.add(item['_id'].encode("utf-8")) result_list = list(uid_set & in_set) except Exception, e: print e raise Exception('user_list failed!') else: result_list = list(uid_set) if not isall: uid_list = in_sort_filter(time, sort_norm, sort_scope, None, result_list, True) else: uid_list = all_sort_filter(result_list, sort_norm, time, True) results = make_up_user_info(uid_list, isall, time, sort_norm) query = { "query": { "bool": { "must": [{ "term": { "user_rank_task.user_ts": search_key } }], "must_not": [], "should": [] } }, "from": 0, "size": 10,