Example #1
def test_classify(f_name):

    user_info = load_user_info(f_name)  # load user background info
    print(len(user_info))
    user_text = load_user_text(f_name)  # load user post data
    print(len(user_text))
    #user_interaction = load_user_interaction()  # load user interaction info

    user_data = combine_user_info(user_info, user_text)
    print(len(user_data))
    user_label = domain_main(user_data)
    print(len(user_label))

    ##    keyword = dict()
    ##    for k,v in user_label.iteritems():
    ##        if (v == 'politician' or v == 'business') and not user_data[k]['category']:
    ##            bio_string = user_data[k]['bio_str']
    ##            bio_string_s = cc.convert(bio_string.decode('utf-8'))
    ##            kwdlist = cut(s, bio_string_s.encode('utf-8'))
    ##            for k in kwdlist:
    ##                try:
    ##                    keyword[k] = keyword[k] + 1
    ##                except KeyError:
    ##                    keyword[k] = 1
    ##
    ##    with open('./result/keywords.csv', 'wb') as f:
    ##        writer = csv.writer(f)
    ##        for k,v in keyword.iteritems():
    ##            writer.writerow((k,v))
    ##    f.close()

    with open('./result/user_label.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        for k, v in user_label.items():
            writer.writerow(
                (k, v, user_data[k]['category'], user_data[k]['bio_str'],
                 user_data[k]['number_of_text']))
Example #2
def my_domain_classfiy(uid_list, datetime_list):
    domain_results = {}
    # Results are stored in the database after processing; before processing,
    # query the database for previously stored results to improve efficiency.
    uids = uid_list
    unresolved_uids = []
    res = es.mget(index=fb_portrait_index_name,
                  doc_type=fb_portrait_index_type,
                  body={'ids': uids})['docs']
    for r in res:
        uid = r['_id']
        if 'found' in r:
            found = r['found']
            if found and 'domain' in r['_source']:
                domain = r['_source']['domain']
                domain_results[uid] = domain
            else:
                unresolved_uids.append(uid)
        else:  # the ES index has no record for this uid yet
            unresolved_uids.append(uid)

    # compute and store results for uids not found in the database
    user_domain = {}
    user_domain_temp = {}
    if unresolved_uids:
        fb_flow_text_index_list = []
        for datetime in datetime_list:
            fb_flow_text_index_list.append(flow_text_index_name_pre + datetime)

        user_domain_data = {}
        #load num of text
        count_result = count_text_num(unresolved_uids, fb_flow_text_index_list)
        #load baseinfo
        fb_user_query_body = {
            'query': {
                "filtered": {
                    "filter": {
                        "bool": {
                            "must": [
                                {
                                    "terms": {
                                        "uid": unresolved_uids
                                    }
                                },
                            ]
                        }
                    }
                }
            },
            'size': MAX_SEARCH_SIZE,
            "fields": ["bio_str", "category", "uid"]
        }
        try:
            search_results = es.search(index=facebook_user_index_name,
                                       doc_type=facebook_user_index_type,
                                       body=fb_user_query_body)['hits']['hits']
            for item in search_results:
                content = item['fields']
                uid = content['uid'][0]
                if uid not in user_domain_data:
                    text_num = count_result[uid]
                    user_domain_data[uid] = {
                        'bio_str': '',
                        'category': '',
                        'number_of_text': text_num
                    }
                # For long texts, Goslate splits the input at punctuation and
                # newline boundaries into sub-texts of close to 2000 bytes
                # each, queries them one by one, and concatenates the
                # translations before returning. This is how Goslate works
                # around the text-length limit.
                category = content['category'][0] if 'category' in content else ''
                bio_str = content['bio_str'][0] if 'bio_str' in content else '____'
                user_domain_data[uid]['bio_str'] = bio_str
                user_domain_data[uid]['category'] = category
        except Exception as e:
            print(e)
        # domain computation
        user_domain_temp = domain_main(user_domain_data)
        for uid in unresolved_uids:
            user_domain[uid] = {'domain': user_domain_temp.get(uid, 'other')}
        save_data2es(user_domain)
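
# The cache-first pattern used above (mget the portrait index, compute domains
# only for the misses, then write the fresh results back) can be distilled
# into a small reusable helper. A minimal sketch, assuming an elasticsearch-py
# 2.x-style client; compute_domains is a hypothetical callable that would wrap
# domain_main(), and save_results a hypothetical stand-in for save_data2es():
def cached_domain_lookup(es, index, doc_type, uids, compute_domains, save_results):
    results, missing = {}, []
    docs = es.mget(index=index, doc_type=doc_type, body={'ids': uids})['docs']
    for doc in docs:
        source = doc.get('_source', {})
        if doc.get('found') and 'domain' in source:
            results[doc['_id']] = source['domain']  # cache hit
        else:
            missing.append(doc['_id'])  # cache miss, recompute below
    if missing:
        fresh = compute_domains(missing)  # {uid: domain}
        save_results({uid: {'domain': d} for uid, d in fresh.items()})
        results.update(fresh)
    return results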
Example #3
                location_ch = ''
            description_ch = content['description_ch'][0][:1000] if 'description_ch' in content else ''
            username = content['username'][0] if 'username' in content else ''
            user_domain_data[uid]['location'] = location_ch
            user_domain_data[uid]['username'] = username
            user_domain_data[uid]['description'] = description_ch
    except Exception as e:
        print(e)
    # domain computation
    user_domain_temp = domain_main(user_domain_data)
    user_domain = {}
    for uid in uid_list:
        user_domain[uid] = {'domain': user_domain_temp.get(uid, 'other')}
    return save_data2es(user_domain)


def update_topic(uid_list=None):
    # avoid a mutable default argument; load the full list when none is given
    if not uid_list:
        uid_list = load_uid_list()
    tw_flow_text_index_list = get_twitter_flow_text_index_list(
        load_timestamp(), TEST_MAX_FLOW_TEXT_DAYS)
    user_topic_data = get_filter_keywords(tw_flow_text_index_list, uid_list)
Example #4
def my_domain_classfiy(uid_list, datetime_list):
    domain_results = {}
    # Results are stored in the database after processing; before processing,
    # query the database for previously stored results to improve efficiency.
    uids = uid_list
    unresolved_uids = []
    res = es.mget(index=tw_portrait_index_name,
                  doc_type=tw_portrait_index_type,
                  body={'ids': uids})['docs']
    for r in res:
        uid = r['_id']
        if 'found' in r:
            found = r['found']
            if found and 'domain' in r['_source']:
                domain = r['_source']['domain']
                domain_results[uid] = domain
            else:
                unresolved_uids.append(uid)
        else:  # the ES index has no record for this uid yet
            unresolved_uids.append(uid)

    # compute and store results for uids not found in the database
    user_domain = {}
    user_domain_temp = {}
    if unresolved_uids:
        tw_flow_text_index_list = []
        for datetime in datetime_list:
            tw_flow_text_index_list.append(flow_text_index_name_pre + datetime)

        user_domain_data = {}
        #load num of text
        count_result = count_text_num(unresolved_uids, tw_flow_text_index_list)
        #load baseinfo
        tw_user_query_body = {
            'query': {
                "filtered": {
                    "filter": {
                        "bool": {
                            "must": [
                                {
                                    "terms": {
                                        "uid": unresolved_uids
                                    }
                                },
                            ]
                        }
                    }
                }
            },
            'size': MAX_SEARCH_SIZE,
            "fields": ["location", "username", "description", "uid"]
        }
        try:
            search_results = es.search(index=twitter_user_index_name,
                                       doc_type=twitter_user_index_type,
                                       body=tw_user_query_body)['hits']['hits']
            for item in search_results:
                content = item['fields']
                uid = content['uid'][0]
                if uid not in user_domain_data:
                    text_num = count_result[uid]
                    user_domain_data[uid] = {
                        'location': '',
                        'username': '',
                        'description': '',
                        'number_of_text': text_num
                    }
                location = content['location_ch'][0] if 'location_ch' in content else ''
                description = content['description_ch'][0] if 'description_ch' in content else ''
                username = content['username'][0] if 'username' in content else ''
                user_domain_data[uid]['location'] = location
                user_domain_data[uid]['username'] = username
                user_domain_data[uid]['description'] = description
        except Exception as e:
            print(e)
        # domain computation
        user_domain_temp = domain_main(user_domain_data)
        for uid in unresolved_uids:
            user_domain[uid] = {'domain': user_domain_temp.get(uid, 'other')}
        save_data2es(user_domain)
Example #5
def my_domain_classfiy(uid_list, datetime_list):
    domain_results = {}
    # Results are stored in the database after processing; before processing,
    # query the database for previously stored results to improve efficiency.
    uids = uid_list
    unresolved_uids = []
    res = es.mget(index=fb_portrait_index_name,
                  doc_type=fb_portrait_index_type,
                  body={'ids': uids})['docs']
    for r in res:
        uid = r['_id']
        if 'found' in r:
            found = r['found']
            if found and 'domain' in r['_source']:
                domain = r['_source']['domain']
                domain_results[uid] = domain
            else:
                unresolved_uids.append(uid)
        else:  # the ES index has no record for this uid yet
            unresolved_uids.append(uid)

    # compute and store results for uids not found in the database
    user_domain = {}
    user_domain_temp = {}
    if unresolved_uids:
        fb_flow_text_index_list = []
        for datetime in datetime_list:
            fb_flow_text_index_list.append(flow_text_index_name_pre + datetime)

        user_domain_data = {}
        #load num of text
        count_result = count_text_num(unresolved_uids, fb_flow_text_index_list)
        #load baseinfo
        fb_user_query_body = {
            'query': {
                "filtered": {
                    "filter": {
                        "bool": {
                            "must": [
                                {
                                    "terms": {
                                        "uid": unresolved_uids
                                    }
                                },
                            ]
                        }
                    }
                }
            },
            'size': MAX_SEARCH_SIZE,
            "fields":
            ["bio", "about", "description", "quotes", "category", "uid"]
        }
        try:
            search_results = es.search(index=facebook_user_index_name,
                                       doc_type=facebook_user_index_type,
                                       body=fb_user_query_body)['hits']['hits']
            for item in search_results:
                content = item['fields']
                uid = content['uid'][0]
                if uid not in user_domain_data:
                    text_num = count_result[uid]
                    user_domain_data[uid] = {
                        'bio_str': '',
                        'bio_list': [],
                        'category': '',
                        'number_of_text': text_num
                    }
                # For long texts, Goslate splits the input at punctuation and
                # newline boundaries into sub-texts of close to 2000 bytes
                # each, queries them one by one, and concatenates the
                # translations before returning. This is how Goslate works
                # around the text-length limit.
                category = content['category'][0] if 'category' in content else ''
                # Some user descriptions run to 3000+ characters, add little
                # value, and cause problems during translation, so keep only
                # a 1000-character prefix.
                description = content['description'][0][:1000] if 'description' in content else ''
                quotes = content['quotes'][0][:1000] if 'quotes' in content else ''
                bio = content['bio'][0][:1000] if 'bio' in content else ''
                about = content['about'][0][:1000] if 'about' in content else ''
                user_domain_data[uid]['bio_list'] = [
                    quotes, bio, about, description
                ]
                user_domain_data[uid]['category'] = category
        except Exception as e:
            print(e)
        # Translating one user per request is too slow, so translate in batches.
        trans_uid_list = []
        untrans_bio_data = []
        cut = 100  # users per translation batch (4 strings per user)
        n = len(user_domain_data) // cut  # number of full batches
        for uid, content in user_domain_data.items():
            trans_uid_list.append(uid)
            untrans_bio_data.extend(content['bio_list'])
            content.pop('bio_list')
            if n:
                if len(trans_uid_list) % cut == 0:  # a full batch is ready
                    temp_trans_bio_data = trans_bio_data(untrans_bio_data)
                    for i in range(len(trans_uid_list)):
                        uid = trans_uid_list[i]
                        # stitch each user's four translated fields back together
                        user_domain_data[uid]['bio_str'] = '_'.join(
                            temp_trans_bio_data[4 * i:4 * i + 4])
                    trans_uid_list = []
                    untrans_bio_data = []
                    n = n - 1
            else:
                if len(trans_uid_list) == (len(user_domain_data) % cut):  # final partial batch
                    temp_trans_bio_data = trans_bio_data(untrans_bio_data)
                    for i in range(len(trans_uid_list)):
                        uid = trans_uid_list[i]
                        user_domain_data[uid]['bio_str'] = '_'.join(
                            temp_trans_bio_data[4 * i:4 * i + 4])
                    trans_uid_list = []
                    untrans_bio_data = []
        # domain computation
        user_domain_temp = domain_main(user_domain_data)
        for uid in unresolved_uids:
            user_domain[uid] = {'domain': user_domain_temp.get(uid, 'other')}
        save_data2es(user_domain)
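
# The Goslate behavior described above (split long text at punctuation and
# newline boundaries into chunks under ~2000 bytes, translate each chunk, then
# join the results) can be sketched independently of the library. This is a
# minimal illustration, not Goslate's actual implementation; translate_chunk
# is a hypothetical stand-in for the real translation call:
import re

def translate_long_text(text, translate_chunk, limit=2000):
    # break the text into sentence-like pieces ending at punctuation/newline
    pieces = re.findall(r'.+?(?:[.!?\n]|$)', text, flags=re.S)
    chunks, current = [], ''
    for piece in pieces:
        # start a new chunk once adding this piece would exceed the byte limit
        if current and len((current + piece).encode('utf-8')) > limit:
            chunks.append(current)
            current = piece
        else:
            current += piece
    if current:
        chunks.append(current)
    return ''.join(translate_chunk(chunk) for chunk in chunks)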
Example #6
def test_classify(f_name):

    user_info = load_user_info(f_name)  # load user background info
    print(len(user_info))
    user_text = load_user_text(f_name)  # load user post data
    print(len(user_text))
    #user_interaction = load_user_interaction()  # load user interaction info

    user_data = combine_user_info(user_info, user_text)
    print(len(user_data))
    user_label = domain_main(user_data)
    print(len(user_label))

    ##    lawyer_keyword = dict()
    ##    university_keyword = dict()
    ##    media_keyword = dict()
    ##    mediaworker_keyword = dict()
    ##    admin_keyword = dict()
    ##    for k,v in user_data.iteritems():
    ##        bio_string = v['username'] + '_' + v['description']
    ##        if len(bio_string) <= 1:
    ##            continue
    ##        if k in lawyer_uid:
    ##            bio_string_s = cc.convert(bio_string.decode('utf-8'))
    ##            kwdlist = cut(s, bio_string_s.encode('utf-8'))
    ##            for w in kwdlist:
    ##                try:
    ##                    lawyer_keyword[w] = lawyer_keyword[w] + 1
    ##                except KeyError:
    ##                    lawyer_keyword[w] = 1
    ##        elif k in university_uid:
    ##            bio_string_s = cc.convert(bio_string.decode('utf-8'))
    ##            kwdlist = cut(s, bio_string_s.encode('utf-8'))
    ##            for w in kwdlist:
    ##                try:
    ##                    university_keyword[w] = university_keyword[w] + 1
    ##                except KeyError:
    ##                    university_keyword[w] = 1
    ##        elif k in media_uid:
    ##            bio_string_s = cc.convert(bio_string.decode('utf-8'))
    ##            kwdlist = cut(s, bio_string_s.encode('utf-8'))
    ##            for w in kwdlist:
    ##                try:
    ##                    media_keyword[w] = media_keyword[w] + 1
    ##                except KeyError:
    ##                    media_keyword[w] = 1
    ##        elif k in mediaworker_uid:
    ##            bio_string_s = cc.convert(bio_string.decode('utf-8'))
    ##            kwdlist = cut(s, bio_string_s.encode('utf-8'))
    ##            for w in kwdlist:
    ##                try:
    ##                    mediaworker_keyword[w] = mediaworker_keyword[w] + 1
    ##                except KeyError:
    ##                    mediaworker_keyword[w] = 1
    ##        elif k in admin_uid:
    ##            bio_string_s = cc.convert(bio_string.decode('utf-8'))
    ##            kwdlist = cut(s, bio_string_s.encode('utf-8'))
    ##            for w in kwdlist:
    ##                try:
    ##                    admin_keyword[w] = admin_keyword[w] + 1
    ##                except KeyError:
    ##                    admin_keyword[w] = 1
    ##        else:
    ##            continue
    ##
    ##    with open('./word_dict/admin_keywords.csv', 'wb') as f:
    ##        writer = csv.writer(f)
    ##        for k,v in admin_keyword.iteritems():
    ##            writer.writerow((k,v))
    ##    f.close()
    ##
    ##    with open('./word_dict/mediaworker_keywords.csv', 'wb') as f:
    ##        writer = csv.writer(f)
    ##        for k,v in mediaworker_keyword.iteritems():
    ##            writer.writerow((k,v))
    ##    f.close()
    ##
    ##    with open('./word_dict/media_keywords.csv', 'wb') as f:
    ##        writer = csv.writer(f)
    ##        for k,v in media_keyword.iteritems():
    ##            writer.writerow((k,v))
    ##    f.close()
    ##
    ##    with open('./word_dict/university_keywords.csv', 'wb') as f:
    ##        writer = csv.writer(f)
    ##        for k,v in university_keyword.iteritems():
    ##            writer.writerow((k,v))
    ##    f.close()
    ##
    ##    with open('./word_dict/lawyer_keywords.csv', 'wb') as f:
    ##        writer = csv.writer(f)
    ##        for k,v in lawyer_keyword.iteritems():
    ##            writer.writerow((k,v))
    ##    f.close()

    with open('./result/user_label.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        for k, v in user_label.items():
            # map uid-set membership to a group label once, instead of
            # duplicating the writerow call in every branch
            if k in lawyer_uid:
                group = 'lawyer'
            elif k in university_uid:
                group = 'university'
            elif k in media_uid:
                group = 'media'
            elif k in mediaworker_uid:
                group = 'mediaworker'
            elif k in admin_uid:
                group = 'admin'
            else:
                group = 'other'
            writer.writerow(
                (k, v, group, user_data[k]['username'],
                 user_data[k]['description'], user_data[k]['location'],
                 user_data[k]['number_of_text']))
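
# The if/elif chain above can also be driven by a table of (uid_set, label)
# pairs. A sketch assuming the same lawyer_uid/university_uid/... sets used
# above; the first matching set wins, mirroring the branch order:
GROUPS = [(lawyer_uid, 'lawyer'), (university_uid, 'university'),
          (media_uid, 'media'), (mediaworker_uid, 'mediaworker'),
          (admin_uid, 'admin')]

def group_of(uid):
    for uid_set, label in GROUPS:
        if uid in uid_set:
            return label
    return 'other'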