import csv
import json

from config import load_scws, cx_dict, single_word_whitelist, black_word, abs_path, re_cut
from test_topic import topic_classfiy

# The remaining module-level names used below (the es/es_xnr clients, the index
# name/type constants, load_uid_list, load_timestamp, get_filter_keywords,
# get_filter_keywords_for_match_function, save_data2es, zh_data, name_list,
# topic_en2ch_dict, topic_ch2en_dict, MAX_VALUE, TEST_MAX_FLOW_TEXT_DAYS) are
# assumed to come from the project's config/util modules.


def update_topic(uid_list=None):
    # Use None instead of a mutable default argument; fall back to the
    # full uid list when no explicit list is given.
    if not uid_list:
        uid_list = load_uid_list()
    tw_flow_text_index_list = get_twitter_flow_text_index_list(
        load_timestamp(), TEST_MAX_FLOW_TEXT_DAYS)
    user_topic_data = get_filter_keywords(tw_flow_text_index_list, uid_list)
    user_topic_dict, user_topic_list = topic_classfiy(uid_list, user_topic_data)

    # Build the human-readable topic string: Chinese labels joined by '&'.
    user_topic_string = {}
    for uid, topic_list in user_topic_list.items():
        li = []
        for t in topic_list:
            li.append(zh_data[name_list.index(t)].decode('utf8'))
        user_topic_string[uid] = '&'.join(li)

    user_topic = {}
    for uid in uid_list:
        if uid in user_topic_dict:
            user_topic[uid] = {
                'filter_keywords': json.dumps(user_topic_data[uid]),
                'topic': json.dumps(user_topic_dict[uid]),
                'topic_string': user_topic_string[uid]
            }
        else:
            user_topic[uid] = {
                'filter_keywords': json.dumps({}),
                'topic': json.dumps({}),
                'topic_string': ''
            }
    return save_data2es(user_topic)
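
# A minimal usage sketch for update_topic() (the uid below is hypothetical,
# and the ES wiring is assumed to come from this module's config):
def _example_update_topic():
    # Full refresh: falls back to load_uid_list() internally.
    update_topic()
    # Targeted refresh for a single (hypothetical) uid; the result is
    # whatever save_data2es() returns for the bulk write.
    return update_topic(['2176270443'])
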
def match_flow_text(current_date):
    '''
    The mapping is wrong, so the index is not created or populated for now
    (2018-4-13 17:11:51):
    new_xnr_flow_text_index_name = new_xnr_flow_text_index_name_pre + current_date
    new_weibo_xnr_flow_text_mappings(new_xnr_flow_text_index_name)
    '''
    flow_text_index_name = new_fb_xnr_flow_text_index_name_pre + current_date
    query_body = {'query': {'match_all': {}}, 'size': MAX_VALUE}
    try:
        search_results = es_xnr.search(index=fb_xnr_index_name,
                                       doc_type=fb_xnr_index_type,
                                       body=query_body)['hits']['hits']
        bulk_action = []
        uid_list = []
        xnr_user_no_list = []
        for result in search_results:
            result = result['_source']
            if 'uid' in result:
                uid_list.append(result['uid'])
                xnr_user_no_list.append(result['xnr_user_no'])

        # filter_keywords = {uid1: {mid1: {keyword: freq, ...}, ...}, ...}
        filter_keywords = get_filter_keywords_for_match_function(
            [flow_text_index_name], uid_list)
        for uid, content in filter_keywords.items():
            mid_list = []
            mid_weibo = {}  # {mid1: {'key1': f1, 'key2': f2, ...}, ...}
            for mid, keywords_dict in content.items():
                # Re-key to utf-8 byte strings in a fresh dict instead of
                # mutating keywords_dict while iterating over it.
                encoded = {}
                for k, v in keywords_dict.items():
                    encoded[k.encode('utf-8', 'ignore')] = v
                mid_list.append(mid)
                mid_weibo[mid] = encoded
            # mid_topic_dict: {mid1: {'art': 0.1, 'social': 0.2, ...}, ...}
            # mid_topic_list: {mid1: ['art', 'social', 'media'], ...}
            mid_topic_dict, mid_topic_list = topic_classfiy(mid_list, mid_weibo)
            for mid, topic_dict in mid_topic_dict.items():
                match_item = {
                    'topic_field_first': topic_en2ch_dict[mid_topic_list[mid][0]],
                    'topic_field': '&'.join(mid_topic_list[mid]),
                    'xnr_user_no': xnr_user_no_list[uid_list.index(uid)],
                }
                action = {'update': {'_id': mid}}
                bulk_action.extend([action, {'doc': match_item}])
        if bulk_action:
            es_xnr.bulk(bulk_action, index=flow_text_index_name,
                        doc_type=new_fb_xnr_flow_text_index_type, timeout=600)
    except Exception as e:
        # print e
        return 'no tweets to update today'
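
# A minimal usage sketch for match_flow_text(). The date format is assumed to
# match the flow-text index suffix used elsewhere in the project (e.g.
# '2018-04-13'); adjust if the real suffix differs.
def _example_match_flow_text():
    return match_flow_text('2018-04-13')
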
def my_topic_classfiy(uid_list, datetime_list):
    topic_dict_results = {}
    topic_string_results = {}
    # Results are cached in ES: before computing, check whether the portrait
    # index already holds a stored topic for each uid, to avoid recomputation.
    uids = uid_list
    unresolved_uids = []
    res = es.mget(index=tw_portrait_index_name,
                  doc_type=tw_portrait_index_type,
                  body={'ids': uids})['docs']
    for r in res:
        uid = r['_id']
        if 'found' in r:
            if r['found'] and 'topic' in r['_source']:
                topic = r['_source']['topic']
                topic_string = r['_source']['topic_string']
                topic_dict_results[uid] = json.loads(topic)
                # Guard against cached empty results: '' would split to ['']
                # and raise a KeyError in topic_ch2en_dict.
                if topic_string:
                    topic_string_results[uid] = [
                        topic_ch2en_dict[ch_topic]
                        for ch_topic in topic_string.split('&')
                    ]
                else:
                    topic_string_results[uid] = []
            else:
                unresolved_uids.append(uid)
        else:
            # The ES index holds no record at all for this uid.
            unresolved_uids.append(uid)

    # Compute and store topics for the uids not found in the cache.
    user_topic_dict = {}
    user_topic_list = {}
    if unresolved_uids:
        tw_flow_text_index_list = []
        for dt in datetime_list:
            tw_flow_text_index_list.append(flow_text_index_name_pre + dt)
        user_topic_data = get_filter_keywords(tw_flow_text_index_list,
                                              unresolved_uids)
        user_topic_dict, user_topic_list = topic_classfiy(unresolved_uids,
                                                          user_topic_data)
        user_topic_string = {}
        for uid, topic_list in user_topic_list.items():
            li = []
            for t in topic_list:
                li.append(zh_data[name_list.index(t)].decode('utf8'))
            user_topic_string[uid] = '&'.join(li)
        user_topic = {}
        for uid in unresolved_uids:
            if uid in user_topic_dict:
                user_topic[uid] = {
                    'filter_keywords': json.dumps(user_topic_data[uid]),
                    'topic': json.dumps(user_topic_dict[uid]),
                    'topic_string': user_topic_string[uid]
                }
            else:
                user_topic[uid] = {
                    'filter_keywords': json.dumps({}),
                    'topic': json.dumps({}),
                    'topic_string': ''
                }
        save_data2es(user_topic)

    # Merge the cached results with the freshly computed ones.
    user_topic_dict.update(topic_dict_results)
    user_topic_list.update(topic_string_results)
    return user_topic_dict, user_topic_list
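
# A minimal usage sketch for my_topic_classfiy(): classify two (hypothetical)
# uids over a two-day window, hitting the ES portrait cache first and only
# computing topics for uids without stored results.
def _example_my_topic_classfiy():
    uids = ['2176270443', '1698233740']   # hypothetical uids
    dates = ['2018-04-12', '2018-04-13']  # assumed flow-text index suffixes
    user_topic_dict, user_topic_list = my_topic_classfiy(uids, dates)
    return user_topic_dict, user_topic_list
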
def input_data():
    # Test input: read weibo texts from a CSV file, segment them with scws,
    # and build a {mid: {word: frequency}} dict per record.
    sw = load_scws()
    uid_weibo = dict()
    uid_list = []
    reader = csv.reader(open(abs_path + '/weibo_data/uid_text_0728.csv', 'rb'))
    for mid, w_text in reader:
        v = re_cut(w_text.decode('utf-8'))
        words = sw.participle(v.encode('utf-8'))
        word_list = dict()
        for word in words:
            # Keep nouns, verbs and adjectives from the segmentation result
            # and drop single characters (one CJK char is 3 bytes in utf-8).
            if (word[1] in cx_dict) and 3 < len(word[0]) < 30 and (
                    word[0] not in black_word) and (
                    word[0] not in single_word_whitelist):
                if word[0] in word_list:
                    word_list[word[0]] += 1
                else:
                    word_list[word[0]] = 1
        uid_list.append(mid)
        uid_weibo[mid] = word_list
    return uid_list, uid_weibo


if __name__ == '__main__':
    uid_list, uid_weibo = input_data()
    uid_topic = topic_classfiy(uid_list, uid_weibo)
    print uid_topic