def test_classify(f_name):
    user_info = load_user_info(f_name)  # load user background info
    print len(user_info)
    user_text = load_user_text(f_name)  # load user posting info
    print len(user_text)
    #user_interaction = load_user_interaction()  # load user interaction info
    user_data = combine_user_info(user_info, user_text)
    print len(user_data)
    user_label = domain_main(user_data)
    print len(user_label)

##    keyword = dict()
##    for k, v in user_label.iteritems():
##        if (v == 'politician' or v == 'business') and not user_data[k]['category']:
##            bio_string = user_data[k]['bio_str']
##            bio_string_s = cc.convert(bio_string.decode('utf-8'))
##            kwdlist = cut(s, bio_string_s.encode('utf-8'))
##            for k in kwdlist:
##                try:
##                    keyword[k] = keyword[k] + 1
##                except KeyError:
##                    keyword[k] = 1
##
##    with open('./result/keywords.csv', 'wb') as f:
##        writer = csv.writer(f)
##        for k, v in keyword.iteritems():
##            writer.writerow((k, v))
##        f.close()

    with open('./result/user_label.csv', 'wb') as f:
        writer = csv.writer(f)
        for k, v in user_label.iteritems():
            writer.writerow((k, v, user_data[k]['category'],
                             user_data[k]['bio_str'],
                             user_data[k]['number_of_text']))
        f.close()
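# combine_user_info() is defined elsewhere in this project. A minimal sketch of
# the shape it presumably has, assuming user_info and user_text are both dicts
# keyed by uid and that user_text only carries a per-user post count (the name
# and the exact fields below are assumptions for illustration):
def combine_user_info_sketch(user_info, user_text):
    user_data = {}
    for uid, info in user_info.iteritems():
        data = dict(info)  # copy background fields such as bio_str / category
        data['number_of_text'] = user_text.get(uid, 0)  # default 0 if no posts
        user_data[uid] = data
    return user_data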
def my_domain_classfiy(uid_list, datetime_list):
    domain_results = {}
    # Results are written back to the database; before computing, look up any
    # previously stored results so already-classified users are not recomputed.
    uids = uid_list
    unresolved_uids = []
    res = es.mget(index=fb_portrait_index_name,
                  doc_type=fb_portrait_index_type,
                  body={'ids': uids})['docs']
    for r in res:
        uid = r['_id']
        if r.has_key('found'):
            found = r['found']
            if found and r['_source'].has_key('domain'):
                domain = r['_source']['domain']
                domain_results[uid] = domain
            else:
                unresolved_uids.append(uid)
        else:
            # no record for this uid in the ES index yet
            unresolved_uids.append(uid)

    # compute and store the users that are not in the database yet
    user_domain = {}
    user_domain_temp = {}
    if unresolved_uids:
        fb_flow_text_index_list = []
        for datetime in datetime_list:
            fb_flow_text_index_list.append(flow_text_index_name_pre + datetime)
        user_domain_data = {}
        # load number of texts
        count_result = count_text_num(unresolved_uids, fb_flow_text_index_list)
        # load base info
        fb_user_query_body = {
            'query': {
                "filtered": {
                    "filter": {
                        "bool": {
                            "must": [
                                {"terms": {"uid": unresolved_uids}},
                            ]
                        }
                    }
                }
            },
            'size': MAX_SEARCH_SIZE,
            "fields": ["bio_str", "category", "uid"]
        }
        try:
            search_results = es.search(index=facebook_user_index_name,
                                       doc_type=facebook_user_index_type,
                                       body=fb_user_query_body)['hits']['hits']
            for item in search_results:
                content = item['fields']
                uid = content['uid'][0]
                if not uid in user_domain_data:
                    text_num = count_result[uid]
                    user_domain_data[uid] = {
                        'bio_str': '',
                        'category': '',
                        'number_of_text': text_num
                    }
                # For long texts, Goslate splits the input at punctuation and line
                # breaks into sub-texts of roughly 2000 bytes each, translates them
                # one by one, and concatenates the results before returning them;
                # this is how it works around the per-request length limit.
                if content.has_key('category'):
                    category = content.get('category')[0]
                else:
                    category = ''
                if content.has_key('bio_str'):
                    bio_str = content.get('bio_str')[0]
                else:
                    bio_str = '____'
                user_domain_data[uid]['bio_str'] = bio_str
                user_domain_data[uid]['category'] = category
        except Exception, e:
            print e

        # domain computation
        user_domain_temp = domain_main(user_domain_data)
        for uid in unresolved_uids:
            if uid in user_domain_temp:
                user_domain[uid] = {'domain': user_domain_temp[uid]}
            else:
                user_domain_temp[uid] = 'other'
                user_domain[uid] = {'domain': 'other'}
        save_data2es(user_domain)
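# count_text_num() is defined elsewhere. A minimal sketch of how it might count
# posts per user across the flow-text indexes, assuming the documents carry a
# 'uid' field and that a terms aggregation is acceptable here (the aggregation
# name 'per_uid' and the overall layout are assumptions for illustration):
def count_text_num_sketch(uid_list, flow_text_index_list):
    count_result = dict((uid, 0) for uid in uid_list)  # default 0 for silent users
    query_body = {
        'query': {
            'filtered': {
                'filter': {'terms': {'uid': uid_list}}
            }
        },
        'size': 0,
        'aggs': {
            'per_uid': {'terms': {'field': 'uid', 'size': len(uid_list)}}
        }
    }
    buckets = es.search(index=flow_text_index_list,
                        body=query_body)['aggregations']['per_uid']['buckets']
    for bucket in buckets:
        count_result[bucket['key']] = bucket['doc_count']
    return count_result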
            location_ch = ''
            if content.has_key('description_ch'):
                description_ch = content.get('description_ch')[0][:1000]
            else:
                description_ch = ''
            if content.has_key('username'):
                username = content.get('username')[0]
            else:
                username = ''
            user_domain_data[uid]['location'] = location_ch
            user_domain_data[uid]['username'] = username
            user_domain_data[uid]['description'] = description_ch
    except Exception, e:
        print e

    # domain computation
    user_domain_temp = domain_main(user_domain_data)
    user_domain = {}
    for uid in uid_list:
        if uid in user_domain_temp:
            user_domain[uid] = {'domain': user_domain_temp[uid]}
        else:
            user_domain[uid] = {'domain': 'other'}
    return save_data2es(user_domain)


def update_topic(uid_list=[]):
    if not uid_list:
        uid_list = load_uid_list()
    tw_flow_text_index_list = get_twitter_flow_text_index_list(
        load_timestamp(), TEST_MAX_FLOW_TEXT_DAYS)
    user_topic_data = get_filter_keywords(tw_flow_text_index_list, uid_list)
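# get_twitter_flow_text_index_list() is defined elsewhere. A minimal sketch,
# assuming it walks back `days` days from the given Unix timestamp and that the
# flow-text indexes are suffixed with a '%Y-%m-%d' date string (both the suffix
# format and the use of the time module here are assumptions for illustration):
import time


def get_twitter_flow_text_index_list_sketch(timestamp, days):
    index_list = []
    for i in range(days):
        day_str = time.strftime('%Y-%m-%d',
                                time.localtime(timestamp - i * 24 * 3600))
        index_list.append(flow_text_index_name_pre + day_str)
    return index_list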
def my_domain_classfiy(uid_list, datetime_list):
    domain_results = {}
    # Results are written back to the database; before computing, look up any
    # previously stored results so already-classified users are not recomputed.
    uids = uid_list
    unresolved_uids = []
    res = es.mget(index=tw_portrait_index_name,
                  doc_type=tw_portrait_index_type,
                  body={'ids': uids})['docs']
    for r in res:
        uid = r['_id']
        if r.has_key('found'):
            found = r['found']
            if found and r['_source'].has_key('domain'):
                domain = r['_source']['domain']
                domain_results[uid] = domain
            else:
                unresolved_uids.append(uid)
        else:
            # no record for this uid in the ES index yet
            unresolved_uids.append(uid)

    # compute and store the users that are not in the database yet
    user_domain = {}
    user_domain_temp = {}
    if unresolved_uids:
        tw_flow_text_index_list = []
        for datetime in datetime_list:
            tw_flow_text_index_list.append(flow_text_index_name_pre + datetime)
        user_domain_data = {}
        # load number of texts
        count_result = count_text_num(unresolved_uids, tw_flow_text_index_list)
        # load base info (the *_ch translated variants are requested as well,
        # since the loop below reads location_ch / description_ch)
        tw_user_query_body = {
            'query': {
                "filtered": {
                    "filter": {
                        "bool": {
                            "must": [
                                {"terms": {"uid": unresolved_uids}},
                            ]
                        }
                    }
                }
            },
            'size': MAX_SEARCH_SIZE,
            "fields": ["location", "location_ch", "username",
                       "description", "description_ch", "uid"]
        }
        try:
            search_results = es.search(index=twitter_user_index_name,
                                       doc_type=twitter_user_index_type,
                                       body=tw_user_query_body)['hits']['hits']
            for item in search_results:
                content = item['fields']
                uid = content['uid'][0]
                if not uid in user_domain_data:
                    text_num = count_result[uid]
                    user_domain_data[uid] = {
                        'location': '',
                        'username': '',
                        'description': '',
                        'number_of_text': text_num
                    }
                if content.has_key('location_ch'):
                    location = content.get('location_ch')[0]
                else:
                    location = ''
                if content.has_key('description_ch'):
                    description = content.get('description_ch')[0]
                else:
                    description = ''
                if content.has_key('username'):
                    username = content.get('username')[0]
                else:
                    username = ''
                user_domain_data[uid]['location'] = location
                user_domain_data[uid]['username'] = username
                user_domain_data[uid]['description'] = description
        except Exception, e:
            print e

        # domain computation
        user_domain_temp = domain_main(user_domain_data)
        for uid in unresolved_uids:
            if uid in user_domain_temp:
                user_domain[uid] = {'domain': user_domain_temp[uid]}
            else:
                user_domain_temp[uid] = 'other'
                user_domain[uid] = {'domain': 'other'}
        save_data2es(user_domain)
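# save_data2es() is defined elsewhere. A minimal sketch, assuming it upserts the
# computed {'domain': ...} fields into the portrait index through the standard
# elasticsearch-py bulk call (the update/doc_as_upsert layout is an assumption
# for illustration; index and doc_type reuse the module's own globals):
def save_data2es_sketch(user_domain):
    bulk_action = []
    for uid, fields in user_domain.iteritems():
        bulk_action.append({'update': {'_id': uid}})                # action line
        bulk_action.append({'doc': fields, 'doc_as_upsert': True})  # partial doc
    if bulk_action:
        es.bulk(bulk_action, index=tw_portrait_index_name,
                doc_type=tw_portrait_index_type)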
def my_domain_classfiy(uid_list, datetime_list):
    domain_results = {}
    # Results are written back to the database; before computing, look up any
    # previously stored results so already-classified users are not recomputed.
    uids = uid_list
    unresolved_uids = []
    res = es.mget(index=fb_portrait_index_name,
                  doc_type=fb_portrait_index_type,
                  body={'ids': uids})['docs']
    for r in res:
        uid = r['_id']
        if r.has_key('found'):
            found = r['found']
            if found and r['_source'].has_key('domain'):
                domain = r['_source']['domain']
                domain_results[uid] = domain
            else:
                unresolved_uids.append(uid)
        else:
            # no record for this uid in the ES index yet
            unresolved_uids.append(uid)

    # compute and store the users that are not in the database yet
    user_domain = {}
    user_domain_temp = {}
    if unresolved_uids:
        fb_flow_text_index_list = []
        for datetime in datetime_list:
            fb_flow_text_index_list.append(flow_text_index_name_pre + datetime)
        user_domain_data = {}
        # load number of texts
        count_result = count_text_num(unresolved_uids, fb_flow_text_index_list)
        # load base info
        fb_user_query_body = {
            'query': {
                "filtered": {
                    "filter": {
                        "bool": {
                            "must": [
                                {"terms": {"uid": unresolved_uids}},
                            ]
                        }
                    }
                }
            },
            'size': MAX_SEARCH_SIZE,
            "fields": ["bio", "about", "description", "quotes", "category", "uid"]
        }
        try:
            search_results = es.search(index=facebook_user_index_name,
                                       doc_type=facebook_user_index_type,
                                       body=fb_user_query_body)['hits']['hits']
            for item in search_results:
                content = item['fields']
                uid = content['uid'][0]
                if not uid in user_domain_data:
                    text_num = count_result[uid]
                    user_domain_data[uid] = {
                        'bio_str': '',
                        'bio_list': [],
                        'category': '',
                        'number_of_text': text_num
                    }
                # For long texts, Goslate splits the input at punctuation and line
                # breaks into sub-texts of roughly 2000 bytes each, translates them
                # one by one, and concatenates the results before returning them;
                # this is how it works around the per-request length limit.
                if content.has_key('category'):
                    category = content.get('category')[0]
                else:
                    category = ''
                if content.has_key('description'):
                    # Some descriptions run to 3000+ characters; the extra text adds
                    # nothing and causes translation problems, so keep the first 1000.
                    description = content.get('description')[0][:1000]
                else:
                    description = ''
                if content.has_key('quotes'):
                    quotes = content.get('quotes')[0][:1000]
                else:
                    quotes = ''
                if content.has_key('bio'):
                    bio = content.get('bio')[0][:1000]
                else:
                    bio = ''
                if content.has_key('about'):
                    about = content.get('about')[0][:1000]
                else:
                    about = ''
                user_domain_data[uid]['bio_list'] = [quotes, bio, about, description]
                user_domain_data[uid]['category'] = category
        except Exception, e:
            print e

        # Translating one user per request is too slow, so translate in batches.
        trans_uid_list = []
        untrans_bio_data = []
        cut = 100
        n = len(user_domain_data) / cut
        for uid, content in user_domain_data.items():
            trans_uid_list.append(uid)
            untrans_bio_data.extend(content['bio_list'])
            content.pop('bio_list')
            if n:
                if len(trans_uid_list) % cut == 0:
                    temp_trans_bio_data = trans_bio_data(untrans_bio_data)
                    for i in range(len(trans_uid_list)):
                        uid = trans_uid_list[i]
                        user_domain_data[uid]['bio_str'] = '_'.join(
                            temp_trans_bio_data[4 * i:4 * i + 4])
                    trans_uid_list = []
                    untrans_bio_data = []
                    n = n - 1
            else:
                if len(trans_uid_list) == (len(user_domain_data) % cut):
                    temp_trans_bio_data = trans_bio_data(untrans_bio_data)
                    for i in range(len(trans_uid_list)):
                        uid = trans_uid_list[i]
                        user_domain_data[uid]['bio_str'] = '_'.join(
                            temp_trans_bio_data[4 * i:4 * i + 4])
                    trans_uid_list = []
                    untrans_bio_data = []

        # domain computation
        user_domain_temp = domain_main(user_domain_data)
        for uid in unresolved_uids:
            if uid in user_domain_temp:
                user_domain[uid] = {'domain': user_domain_temp[uid]}
            else:
                user_domain_temp[uid] = 'other'
                user_domain[uid] = {'domain': 'other'}
        save_data2es(user_domain)
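# trans_bio_data() is defined elsewhere. A minimal sketch, assuming it uses
# Goslate (as the comments above suggest) to translate the batched bio fragments
# and return one translated string per input string; the 'zh-CN' target code is
# an assumption for illustration:
def trans_bio_data_sketch(untrans_bio_data):
    import goslate
    gs = goslate.Goslate()
    # Goslate accepts an iterable of strings and yields one translation per
    # input, so the output order matches the 4-fields-per-user batching above.
    return list(gs.translate(untrans_bio_data, 'zh-CN'))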
def test_classify(f_name):
    user_info = load_user_info(f_name)  # load user background info
    print len(user_info)
    user_text = load_user_text(f_name)  # load user posting info
    print len(user_text)
    #user_interaction = load_user_interaction()  # load user interaction info
    user_data = combine_user_info(user_info, user_text)
    print len(user_data)
    user_label = domain_main(user_data)
    print len(user_label)

##    lawyer_keyword = dict()
##    university_keyword = dict()
##    media_keyword = dict()
##    mediaworker_keyword = dict()
##    admin_keyword = dict()
##    for k, v in user_data.iteritems():
##        bio_string = v['username'] + '_' + v['description']
##        if len(bio_string) <= 1:
##            continue
##        if k in lawyer_uid:
##            bio_string_s = cc.convert(bio_string.decode('utf-8'))
##            kwdlist = cut(s, bio_string_s.encode('utf-8'))
##            for w in kwdlist:
##                try:
##                    lawyer_keyword[w] = lawyer_keyword[w] + 1
##                except KeyError:
##                    lawyer_keyword[w] = 1
##        elif k in university_uid:
##            bio_string_s = cc.convert(bio_string.decode('utf-8'))
##            kwdlist = cut(s, bio_string_s.encode('utf-8'))
##            for w in kwdlist:
##                try:
##                    university_keyword[w] = university_keyword[w] + 1
##                except KeyError:
##                    university_keyword[w] = 1
##        elif k in media_uid:
##            bio_string_s = cc.convert(bio_string.decode('utf-8'))
##            kwdlist = cut(s, bio_string_s.encode('utf-8'))
##            for w in kwdlist:
##                try:
##                    media_keyword[w] = media_keyword[w] + 1
##                except KeyError:
##                    media_keyword[w] = 1
##        elif k in mediaworker_uid:
##            bio_string_s = cc.convert(bio_string.decode('utf-8'))
##            kwdlist = cut(s, bio_string_s.encode('utf-8'))
##            for w in kwdlist:
##                try:
##                    mediaworker_keyword[w] = mediaworker_keyword[w] + 1
##                except KeyError:
##                    mediaworker_keyword[w] = 1
##        elif k in admin_uid:
##            bio_string_s = cc.convert(bio_string.decode('utf-8'))
##            kwdlist = cut(s, bio_string_s.encode('utf-8'))
##            for w in kwdlist:
##                try:
##                    admin_keyword[w] = admin_keyword[w] + 1
##                except KeyError:
##                    admin_keyword[w] = 1
##        else:
##            continue
##
##    with open('./word_dict/admin_keywords.csv', 'wb') as f:
##        writer = csv.writer(f)
##        for k, v in admin_keyword.iteritems():
##            writer.writerow((k, v))
##        f.close()
##
##    with open('./word_dict/mediaworker_keywords.csv', 'wb') as f:
##        writer = csv.writer(f)
##        for k, v in mediaworker_keyword.iteritems():
##            writer.writerow((k, v))
##        f.close()
##
##    with open('./word_dict/media_keywords.csv', 'wb') as f:
##        writer = csv.writer(f)
##        for k, v in media_keyword.iteritems():
##            writer.writerow((k, v))
##        f.close()
##
##    with open('./word_dict/university_keywords.csv', 'wb') as f:
##        writer = csv.writer(f)
##        for k, v in university_keyword.iteritems():
##            writer.writerow((k, v))
##        f.close()
##
##    with open('./word_dict/lawyer_keywords.csv', 'wb') as f:
##        writer = csv.writer(f)
##        for k, v in lawyer_keyword.iteritems():
##            writer.writerow((k, v))
##        f.close()

    with open('./result/user_label.csv', 'wb') as f:
        writer = csv.writer(f)
        for k, v in user_label.iteritems():
            if k in lawyer_uid:
                writer.writerow((k, v, 'lawyer', user_data[k]['username'],
                                 user_data[k]['description'],
                                 user_data[k]['location'],
                                 user_data[k]['number_of_text']))
            elif k in university_uid:
                writer.writerow((k, v, 'university', user_data[k]['username'],
                                 user_data[k]['description'],
                                 user_data[k]['location'],
                                 user_data[k]['number_of_text']))
            elif k in media_uid:
                writer.writerow((k, v, 'media', user_data[k]['username'],
                                 user_data[k]['description'],
                                 user_data[k]['location'],
                                 user_data[k]['number_of_text']))
            elif k in mediaworker_uid:
                writer.writerow((k, v, 'mediaworker', user_data[k]['username'],
                                 user_data[k]['description'],
                                 user_data[k]['location'],
                                 user_data[k]['number_of_text']))
            elif k in admin_uid:
                writer.writerow((k, v, 'admin', user_data[k]['username'],
                                 user_data[k]['description'],
                                 user_data[k]['location'],
                                 user_data[k]['number_of_text']))
            else:
                writer.writerow((k, v, 'other', user_data[k]['username'],
                                 user_data[k]['description'],
                                 user_data[k]['location'],
                                 user_data[k]['number_of_text']))
        f.close()
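# Hedged usage example for the entry points above. The file name, uid values and
# date suffixes are placeholders; the real suffix format must match how the
# flow-text indexes are named in this deployment:
#
#   test_classify('test_users.csv')
#   my_domain_classfiy(['1234567890', '2345678901'], ['2016-11-01', '2016-11-02'])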