Example #1
def update_topic(uid_list=[]):
    if not uid_list:
        uid_list = load_uid_list()
    tw_flow_text_index_list = get_twitter_flow_text_index_list(
        load_timestamp(), TEST_MAX_FLOW_TEXT_DAYS)
    user_topic_data = get_filter_keywords(tw_flow_text_index_list, uid_list)
    user_topic_dict, user_topic_list = topic_classfiy(uid_list,
                                                      user_topic_data)

    # Map each uid's topic labels to their Chinese display names, joined by '&'
    user_topic_string = {}
    for uid, topic_list in user_topic_list.items():
        li = []
        for t in topic_list:
            li.append(zh_data[name_list.index(t)].decode('utf8'))
        user_topic_string[uid] = '&'.join(li)
    # Assemble the per-uid record passed to save_data2es
    user_topic = {}
    for uid in uid_list:
        if uid in user_topic_dict:
            user_topic[uid] = {
                'filter_keywords': json.dumps(user_topic_data[uid]),
                'topic': json.dumps(user_topic_dict[uid]),
                'topic_string': user_topic_string[uid]
            }
        else:
            user_topic[uid] = {
                'filter_keywords': json.dumps({}),
                'topic': json.dumps({}),
                'topic_string': ''
            }
    return save_data2es(user_topic)
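
A minimal invocation sketch for the function above; the uid values are hypothetical, and load_uid_list / save_data2es are assumed to be provided by the surrounding module:

# Hypothetical uids; calling update_topic() with no argument falls back to load_uid_list()
update_topic(uid_list=['1234567890', '2345678901'])
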
Example #2
def match_flow_text(current_date):
    '''
    # The mapping is incorrect, so skip creating and storing the data for now (2018-4-13 17:11:51)
    new_xnr_flow_text_index_name = new_xnr_flow_text_index_name_pre + current_date
    new_weibo_xnr_flow_text_mappings(new_xnr_flow_text_index_name)
    '''

    flow_text_index_name = new_fb_xnr_flow_text_index_name_pre + current_date

    query_body = {'query': {'match_all': {}}, 'size': MAX_VALUE}
    try:
        search_results = es_xnr.search(index=fb_xnr_index_name,
                                       doc_type=fb_xnr_index_type,
                                       body=query_body)['hits']['hits']
        bulk_action = []
        uid_list = []
        xnr_user_no_list = []
        count = 0
        for result in search_results:
            result = result['_source']
            if result.has_key('uid'):
                uid_list.append(result['uid'])
                xnr_user_no_list.append(result['xnr_user_no'])

        # print uid_list
        # print xnr_user_no_list
        #filter_keywords = {uid1:{mid1:{k:v, ...}...}...}
        filter_keywords = get_filter_keywords_for_match_function(
            [flow_text_index_name], uid_list)
        for uid, content in filter_keywords.items():
            mid_list = []
            mid_weibo = {}  #({mid1:{'key1':f1,'key2':f2...}...})
            for mid, keywords_dict in content.items():
                # Add utf-8 encoded versions of the keyword keys
                for k, v in keywords_dict.items():
                    keywords_dict[k.encode('utf-8', 'ignore')] = v
                mid_list.append(mid)
                mid_weibo[mid] = keywords_dict

            #mid_topic_dict   {mid1:{'art':0.1,'social':0.2...}...}
            #mid_topic_list   {mid1:['art','social','media']...}
            mid_topic_dict, mid_topic_list = topic_classfiy(
                mid_list, mid_weibo)
            for mid, topic_dict in mid_topic_dict.items():
                match_item = {
                    'topic_field_first':
                    topic_en2ch_dict[mid_topic_list[mid][0]],
                    'topic_field': '&'.join(mid_topic_list[mid]),
                    'xnr_user_no': xnr_user_no_list[uid_list.index(uid)],
                }
                action = {'update': {'_id': mid}}
                bulk_action.extend([action, {'doc': match_item}])

        if bulk_action:
            es_xnr.bulk(bulk_action,
                        index=flow_text_index_name,
                        doc_type=new_fb_xnr_flow_text_index_type,
                        timeout=600)

    except Exception, e:
        #print e
        return 'no tweets to update today'
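
A hedged driver sketch for match_flow_text; the date-string format below is an assumption and should match the suffix used in the flow-text index names:

if __name__ == '__main__':
    from datetime import datetime, timedelta
    # Tag yesterday's flow-text index (date format assumed, not taken from the source)
    yesterday = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')
    match_flow_text(yesterday)
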
Example #3
def my_topic_classfiy(uid_list, datetime_list):
    topic_dict_results = {}
    topic_string_results = {}
    # Save processed results to the database; before processing, check whether
    # previously stored results already exist for these uids, to improve efficiency
    uids = uid_list
    unresolved_uids = []
    res = es.mget(index=tw_portrait_index_name, doc_type=tw_portrait_index_type, body={'ids': uids})['docs']
    for r in res:
        uid = r['_id']
        if r.has_key('found'): 
            found = r['found']
            if found and r['_source'].has_key('topic'):
                topic = r['_source']['topic']
                topic_string = r['_source']['topic_string']
                topic_dict_results[uid] = json.loads(topic)
                topic_string_results[uid] = [topic_ch2en_dict[ch_topic] for ch_topic in topic_string.split('&')]
            else:
                unresolved_uids.append(uid)
        else:   # no record for this uid in the ES index yet
            unresolved_uids.append(uid)

    # Compute and store results for uids not yet in the database
    user_topic_dict = {}
    user_topic_list = {}
    if unresolved_uids:
        tw_flow_text_index_list = []
        for datetime in datetime_list:
            tw_flow_text_index_list.append(flow_text_index_name_pre + datetime)
        user_topic_data = get_filter_keywords(tw_flow_text_index_list, unresolved_uids)
        user_topic_dict, user_topic_list = topic_classfiy(unresolved_uids, user_topic_data)

        user_topic_string = {}
        for uid, topic_list in user_topic_list.items():
            li = []
            for t in topic_list:
                li.append(zh_data[name_list.index(t)].decode('utf8'))
            user_topic_string[uid] = '&'.join(li)
        user_topic = {}
        for uid in unresolved_uids:
            if uid in user_topic_dict:
                user_topic[uid] = {
                    'filter_keywords': json.dumps(user_topic_data[uid]),
                    'topic': json.dumps(user_topic_dict[uid]),
                    'topic_string': user_topic_string[uid]
                }
            else:
                user_topic[uid] = {
                    'filter_keywords': json.dumps({}),
                    'topic': json.dumps({}),
                    'topic_string': ''
                }
        save_data2es(user_topic)

    # Merge cached results with the newly computed ones
    user_topic_dict.update(topic_dict_results)
    user_topic_list.update(topic_string_results)
    return user_topic_dict, user_topic_list
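
A small usage sketch for my_topic_classfiy; the uids and date strings are hypothetical, and the dates are assumed to match the flow_text_index_name_pre suffix format:

# Look up or compute topics for two users over three days (hypothetical values)
uids = ['1001', '1002']
dates = ['2018-04-11', '2018-04-12', '2018-04-13']
topic_dict, topic_list = my_topic_classfiy(uids, dates)
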
Example #4
def input_data():  #测试输入

    sw = load_scws()
    uid_weibo = dict()
    uid_list = []
    reader = csv.reader(file(abs_path + '/weibo_data/uid_text_0728.csv', 'rb'))
    for mid, w_text in reader:
        v = re_cut(w_text.decode('utf-8'))
        words = sw.participle(v.encode('utf-8'))
        word_list = dict()
        for word in words:
            # Keep nouns, verbs and adjectives from the segmentation result,
            # and drop single-character words
            if (word[1] in cx_dict) and 3 < len(word[0]) < 30 \
                    and (word[0] not in black_word) \
                    and (word[0] not in single_word_whitelist):
                if word_list.has_key(word[0]):
                    word_list[word[0]] = word_list[word[0]] + 1
                else:
                    word_list[word[0]] = 1
        uid_list.append(mid)
        uid_weibo[mid] = word_list

    return uid_list, uid_weibo


if __name__ == '__main__':

    uid_list, uid_weibo = input_data()
    uid_topic = topic_classfiy(uid_list, uid_weibo)
    print uid_topic
Example #5
import csv
from config import load_scws, cx_dict, single_word_whitelist, black_word, abs_path, re_cut
from test_topic import topic_classfiy

def input_data():#测试输入

    sw = load_scws()
    uid_weibo = dict()
    uid_list = []
    reader = csv.reader(file(abs_path+'/weibo_data/uid_text_0728.csv', 'rb'))
    for mid,w_text in reader:
        v = re_cut(w_text.decode('utf-8'))
        words = sw.participle(v.encode('utf-8'))
        word_list = dict()
        for word in words:
            # Keep nouns, verbs and adjectives from the segmentation result, and drop single-character words
            if (word[1] in cx_dict) and 3 < len(word[0]) < 30 \
                    and (word[0] not in black_word) \
                    and (word[0] not in single_word_whitelist):
                if word_list.has_key(word[0]):
                    word_list[word[0]] = word_list[word[0]] + 1
                else:
                    word_list[word[0]] = 1
        uid_list.append(mid)
        uid_weibo[mid] = word_list
    
    return uid_list,uid_weibo

if __name__ == '__main__':

    uid_list,uid_weibo = input_data()
    uid_topic = topic_classfiy(uid_list,uid_weibo)
    print uid_topic