def deal_bulk_action(user_info_list, fansnum_max):
    """Compute domain, character and importance attributes for a batch of
    users and send them to the user-portrait index as partial updates.

    Args:
        user_info_list: mapping of uid -> user info dict. Each value is read
            for its 'topic_string' and 'fansnum' entries.
        fansnum_max: forwarded unchanged to get_importance()
            (presumably the follower-count maximum used for scaling --
            TODO confirm against get_importance).

    Returns:
        None. The computed attributes are written through
        es_user_portrait.bulk() and the elapsed time is printed.
    """
    start_ts = time.time()
    uid_list = list(user_info_list)

    # Acquire the weibo text data for the whole batch; the reader depends on
    # the configured input type.
    if WEIBO_API_INPUT_TYPE == 0:
        user_keywords_dict, user_weibo_dict, character_start_ts = \
            read_flow_text_sentiment(uid_list)
    else:
        user_keywords_dict, user_weibo_dict, character_start_ts = \
            read_flow_text(uid_list)

    # Domain classification returns a pair: (per-user scores, per-user label).
    domain_results = domain_classfiy(uid_list, user_keywords_dict)
    domain_results_dict = domain_results[0]
    domain_results_label = domain_results[1]

    # Character classification window: starts at character_start_ts and ends
    # (CHARACTER_TIME_GAP - 1) DAY units later.
    character_start_time = ts2datetime(character_start_ts)
    character_end_time = ts2datetime(
        character_start_ts + DAY * CHARACTER_TIME_GAP - DAY)
    character_sentiment_result_dict = classify_sentiment(
        uid_list, user_weibo_dict, character_start_time, character_end_time,
        WEIBO_API_INPUT_TYPE)
    character_text_result_dict = classify_topic(uid_list, user_keywords_dict)

    bulk_action = []
    for uid in uid_list:
        user_info = user_info_list[uid]
        results = {
            'uid': uid,
            # user domain attributes
            'domain_v3': json.dumps(domain_results_dict[uid]),
            'domain': domain_en2ch(domain_results_label[uid]),
            # user character attributes
            'character_sentiment': character_sentiment_result_dict[uid],
            'character_text': character_text_result_dict[uid],
        }

        # User importance. The key used to be misspelled 'importnace', which
        # wrote a stray field instead of refreshing 'importance'.
        user_topic_string = user_info['topic_string'].encode('utf-8')
        results['importance'] = get_importance(
            results['domain'], user_topic_string, user_info['fansnum'],
            fansnum_max)

        # Bulk body format: an action line followed by the partial document.
        bulk_action.extend([{'update': {'_id': uid}}, {'doc': results}])

    # An empty bulk body is rejected by Elasticsearch, so skip the request
    # when there is nothing to update.
    if bulk_action:
        es_user_portrait.bulk(bulk_action, index=portrait_index_name,
                              doc_type=portrait_index_type)

    end_ts = time.time()
    # log_should_delete
    print('%s sec count %s' % (end_ts - start_ts, len(uid_list)))
def deal_bulk_action(user_info_list, fansnum_max):
    """Compute domain, character and importance attributes for a batch of
    users and send them to the user-portrait index as partial updates.

    This variant classifies character over the window that *ends* at the
    timestamp returned by the text reader and reaches CHARACTER_TIME_GAP
    DAY units back from it.

    Args:
        user_info_list: mapping of uid -> user info dict. Each value is read
            for its 'topic_string' and 'fansnum' entries.
        fansnum_max: forwarded unchanged to get_importance()
            (presumably the follower-count maximum used for scaling --
            TODO confirm against get_importance).

    Returns:
        None. The computed attributes are written through
        es_user_portrait.bulk() and the elapsed time is printed.
    """
    start_ts = time.time()
    uid_list = list(user_info_list)

    # Acquire the weibo text data for the whole batch; the reader depends on
    # the configured input type.
    if WEIBO_API_INPUT_TYPE == 0:
        reader = read_flow_text_sentiment
    else:
        reader = read_flow_text
    user_keywords_dict, user_weibo_dict, character_start_ts = reader(uid_list)

    # Domain classification returns a pair: (per-user scores, per-user label).
    domain_results = domain_classfiy(uid_list, user_keywords_dict)
    domain_results_dict = domain_results[0]
    domain_results_label = domain_results[1]

    # Character classification window: [ts - DAY * CHARACTER_TIME_GAP, ts].
    character_end_time = ts2datetime(character_start_ts)
    character_start_time = ts2datetime(
        character_start_ts - DAY * CHARACTER_TIME_GAP)
    character_sentiment_result_dict = classify_sentiment(
        uid_list, user_weibo_dict, character_start_time, character_end_time,
        WEIBO_API_INPUT_TYPE)
    character_text_result_dict = classify_topic(uid_list, user_keywords_dict)

    bulk_action = []
    for uid in uid_list:
        user_info = user_info_list[uid]
        results = {'uid': uid}

        # user domain attributes
        results['domain_v3'] = json.dumps(domain_results_dict[uid])
        results['domain'] = domain_en2ch(domain_results_label[uid])

        # user character attributes
        results['character_sentiment'] = character_sentiment_result_dict[uid]
        results['character_text'] = character_text_result_dict[uid]

        # User importance. The key used to be misspelled 'importnace', which
        # wrote a stray field instead of refreshing 'importance'.
        user_topic_string = user_info['topic_string'].encode('utf-8')
        results['importance'] = get_importance(
            results['domain'], user_topic_string, user_info['fansnum'],
            fansnum_max)

        # Bulk body format: an action line followed by the partial document.
        bulk_action.extend([{'update': {'_id': uid}}, {'doc': results}])

    # An empty bulk body is rejected by Elasticsearch, so skip the request
    # when there is nothing to update.
    if bulk_action:
        es_user_portrait.bulk(bulk_action, index=portrait_index_name,
                              doc_type=portrait_index_type)

    end_ts = time.time()
    # log_should_delete
    print('%s sec count %s' % (end_ts - start_ts, len(uid_list)))
def deal_bulk_action(user_info_list, fansnum_max):
    """Build the bulk-update payload with topic, domain, politics, keyword,
    online-pattern and importance attributes for a batch of users.

    NOTE: in this version the Elasticsearch write is disabled; the assembled
    payload is only printed.

    Args:
        user_info_list: mapping of uid -> user info dict. Each value is read
            for its 'domain' and 'fansnum' entries.
        fansnum_max: forwarded unchanged to get_importance()
            (presumably the follower-count maximum used for scaling --
            TODO confirm against get_importance).

    Returns:
        None.
    """
    uid_list = list(user_info_list)

    # Acquire the weibo text data for the whole batch; the reader depends on
    # the configured input type. Only the keyword and online-pattern parts of
    # the returned 4-tuple are used below.
    if WEIBO_API_INPUT_TYPE == 0:
        reader = read_flow_text_sentiment
    else:
        reader = read_flow_text
    (user_keywords_dict, user_weibo_dict, online_pattern_dict,
     character_start_ts) = reader(uid_list)

    # Batch classification of topic, domain and political leaning.
    topic_results_dict, topic_results_label = topic_classfiy(
        uid_list, user_keywords_dict)
    # NOTE(review): indexed directly by uid below, so domain_classfiy is
    # assumed to return a uid-keyed mapping here -- confirm.
    domain_results = domain_classfiy(uid_list, user_keywords_dict)
    politics_results = political_classify(uid_list, user_keywords_dict)

    bulk_action = []
    for uid in uid_list:
        results = {'uid': uid}

        # user topic attributes
        results['topic'] = json.dumps(topic_results_dict[uid])
        results['topic_string'] = topic_en2ch(topic_results_label[uid])

        # user domain attributes; fall back to the "other" label when the
        # translated list is empty
        domain_list = domain_en2ch(domain_results[uid])
        if domain_list:
            results['domain_list'] = json.dumps(domain_list)
            results['domain'] = domain_list[0]
        else:
            results['domain'] = "其他"
            results['domain_list'] = json.dumps(["其他"])

        # User politics attribute. The original repeated this assignment
        # later using an undefined name `user`, which raised NameError.
        results['politics'] = politics_en2ch(politics_results[uid])

        # user keywords: the 50 highest-weighted keywords, best first
        keywords_dict = user_keywords_dict.get(uid, {})
        keywords_top50 = sorted(keywords_dict.items(),
                                key=lambda item: item[1],
                                reverse=True)[:50]
        results['keywords'] = json.dumps(keywords_top50)
        results['keywords_string'] = '&'.join(
            [keyword_item[0] for keyword_item in keywords_top50])

        # Online pattern. The aggregate string must be built from the
        # pattern mapping itself; the original called .keys() on the JSON
        # string, which always failed and left the field empty.
        online_pattern = online_pattern_dict.get(uid, {})
        try:
            results['online_pattern'] = json.dumps(online_pattern)
        except (TypeError, ValueError):
            # best effort: an unserialisable pattern is stored as empty
            online_pattern = {}
            results['online_pattern'] = json.dumps(online_pattern)
        try:
            results['online_pattern_aggs'] = '&'.join(online_pattern.keys())
        except (AttributeError, TypeError):
            results['online_pattern_aggs'] = ''

        # user importance, based on the domain stored in user_info_list
        user_domain = user_info_list[uid]['domain'].encode('utf-8')
        user_fansnum = user_info_list[uid]['fansnum']
        results['importance'] = get_importance(
            user_domain, results['topic_string'], user_fansnum, fansnum_max)

        # Bulk body format: an action line followed by the partial document.
        bulk_action.extend([{'update': {'_id': uid}}, {'doc': results}])

    print('bulk_action: %s' % (bulk_action,))
    # NOTE: index write intentionally left disabled, as in the original:
    # es_user_portrait.bulk(bulk_action, index=portrait_index_name,
    #                       doc_type=portrait_index_type)