def gen_inverse_documen_fre(): print "calc inverse document frequency---start" weibo_list_train = [] weibo_list_test = [] status_map_train = read_status() # train for uid in status_map_train: content_concated = " ".join( [one_con_li[4] for one_con_li in status_map_train[uid]]) weibo_list_train.append(content_concated) weibo_map_test = read_status(file_name='test/test_status.txt') for uid in weibo_map_test: content_concated = " ".join( [one_con_li[4] for one_con_li in weibo_map_test[uid]]) weibo_list_test.append(content_concated) all_weibo = [] all_weibo.extend(weibo_list_train) all_weibo.extend(weibo_list_test) N = len(weibo_list_test) + len(weibo_list_train) global inverse_doc_fre for one_weibo in all_weibo: for one_word in set(one_weibo.split(' ')): inverse_doc_fre[one_word] = 1.0 + inverse_doc_fre.get( one_word, 0.0) for one_word in inverse_doc_fre: inverse_doc_fre[one_word] = math.log( float(N) / inverse_doc_fre[one_word]) print "calc inverse document frequency---end"
def gen_test_predict_result(): print "test data predict -- start" test_status = read_status(file_name='test/test_status.txt') for uid in test_status: weibo_list = test_status[uid] true_loc_list = [] for one_weibo in weibo_list: one_weibo_content = one_weibo[4] occur_loc_list = process_one_weibo_return_loclist_original( one_weibo_content) if len(occur_loc_list) != 0: for one_loc in occur_loc_list: one_weibo_vec = train_vecdic_to_vec( gen_vector_for_one_weibo(one_weibo_content)) if predit_oneloc_with_oneweibo(one_loc, one_weibo_vec): true_loc_list.append(tran_loc_to_request(one_loc)) if len(true_loc_list) != 0: test_predict_result[uid] = pick_location_this(true_loc_list) print "uid : ", uid, ' predict loc: ', test_predict_result[uid] print "test data predict -- end" pass
def train_and_predict_age():
    """Train a random forest on the labelled users' age and predict test users.

    Returns:
        dict mapping test uid -> predicted (transformed) age class.
    """
    target_lable, feature_list = feature_construct_for_train(
        read_lable(), read_links(), read_status())
    feature_map = feature_construct_for_teat(
        read_links(file_name='test/test_links.txt'),
        read_status(file_name='test/test_status.txt'))
    from sklearn.ensemble import RandomForestClassifier
    clf = RandomForestClassifier(n_estimators=10)
    clf = clf.fit(feature_list, target_lable)
    pre_result = {}
    for uid in feature_map:
        # predict() expects a 2-D array (n_samples, n_features); passing the
        # bare feature vector is rejected by modern scikit-learn, so wrap it
        # as a one-sample batch and unwrap the single prediction.
        pre_result[uid] = clf.predict([feature_map[uid]])[0]
    return pre_result
def gen_test_predict_result(): print "test data predict -- start" test_status = read_status(file_name='test/test_status.txt') for uid in test_status: weibo_list = test_status[uid] true_loc_dic = {} for one_weibo in weibo_list: one_weibo_content = one_weibo[4] occur_loc_list = process_one_weibo_return_loclist_original(one_weibo_content) if len(occur_loc_list) != 0: one_weibo_vec = normalize(np.array(train_vecdic_to_vec(gen_vector_for_one_weibo(one_weibo_content)))) for one_loc in occur_loc_list: true_loc_dic[one_loc] = 0.5 if one_loc in loc_map_twotypes_data: # true_loc_dic[one_loc] = max(true_loc_dic.get(one_loc, 0.0) , ((one_weibo_vec * loc_map_twotypes_data[one_loc][1]).sum() + 1.0) / ((one_weibo_vec * loc_map_twotypes_data[one_loc][1]).sum() + 1.0 + (one_weibo_vec * loc_map_twotypes_data[one_loc][0]).sum())) true_loc_dic[one_loc] = true_loc_dic.get(one_loc, 0.0) + (one_weibo_vec * loc_map_twotypes_data[one_loc][1]).sum() - (one_weibo_vec * loc_map_twotypes_data[one_loc][0]).sum() # print true_loc_dic[one_loc],(one_weibo_vec * loc_map_twotypes_data[one_loc][1]).sum(),(one_weibo_vec * loc_map_twotypes_data[one_loc][0]).sum() else: true_loc_dic[one_loc] = 0.8 # if len(true_loc_list) != 0: # test_predict_result[uid] = pick_location_this(true_loc_list) # true_loc_for_pick = {} # for one_loc in true_loc_dic: # true_loc_for_pick[tran_loc_to_request(one_loc)] = true_loc_for_pick.get(tran_loc_to_request(one_loc),0.0) + true_loc_dic[one_loc] requested_true_loc = {} for loc in true_loc_dic: if tran_loc_to_request(loc) not in requested_true_loc: requested_true_loc[tran_loc_to_request(loc)] = 0.0 requested_true_loc[tran_loc_to_request(loc)] += true_loc_dic[loc] if len(requested_true_loc) !=0: max_val = -1.0; max_loc = "" for one_loc in requested_true_loc: if requested_true_loc[one_loc] > max_val: max_val = requested_true_loc[one_loc] max_loc = one_loc test_predict_result[uid] = max_loc # print "uid : ", uid, ' predict loc: ', test_predict_result[uid] print "test data predict 
-- end" pass
def for_train(): status_map_train = read_status() # train lable_train = read_lable_original_for_loc() print "-- construct data for train -- start" for uid in lable_train: weibo_list = status_map_train[uid] loc = lable_train[uid]['loc'] for one_weibo in weibo_list: weibo_content = one_weibo[4] occur_loc_list = process_one_weibo_return_loclist_original(weibo_content) if len(occur_loc_list) != 0: this_vec = gen_vector_for_one_weibo(weibo_content) for occur_loc in occur_loc_list: # 初始化 大容器 if occur_loc not in loc_map_twotypes_data: loc_map_twotypes_data[occur_loc] = {} loc_map_twotypes_data[occur_loc][0] = np.array([0.0 for i in range(len(dictionry))]) loc_map_twotypes_data[occur_loc][10] = 1.0 loc_map_twotypes_data[occur_loc][1] = np.array([0.0 for i in range(len(dictionry))]) loc_map_twotypes_data[occur_loc][11] = 1.0 for i in this_vec: if occur_loc == loc: loc_map_twotypes_data[occur_loc][1][i] += this_vec[i] loc_map_twotypes_data[occur_loc][11] += 1.0 else: loc_map_twotypes_data[occur_loc][0][i] += this_vec[i] loc_map_twotypes_data[occur_loc][10] += 1.0 for one_loc in loc_map_twotypes_data: loc_map_twotypes_data[one_loc][0] = loc_map_twotypes_data[one_loc][0]/loc_map_twotypes_data[occur_loc][10] loc_map_twotypes_data[one_loc][1] = loc_map_twotypes_data[one_loc][1]/loc_map_twotypes_data[occur_loc][11] print loc_map_twotypes_data[one_loc][0].sum() print "-- construct data for train -- end"
from tutils_about_weibo import tranfiorm_age
from utils_about_weibo_3 import read_lable, read_links, read_status


def get_unique_source(weibo_map):
    """Return the distinct client/"source" strings (weibo field 2) in weibo_map.

    weibo_map: dict uid -> list of weibo records (indexable sequences).
    Returns the unique source strings as a list (Python 2 dict.keys()).
    """
    unique_source = {}
    for uid in weibo_map:
        for one_weibo in weibo_map[uid]:
            source = one_weibo[2]
            if source not in unique_source:
                unique_source[source] = 1
    return unique_source.keys()


# Module-level: fixed ordering of source strings from the training statuses,
# used so per-source count features line up between train and test users.
unique_source_list = get_unique_source(read_status())


def feature_construct_for_train(label_map, links_map, weibo_map):
    # NOTE(review): this definition continues beyond the visible chunk;
    # only its opening statements are shown and documented here.
    target_lable = []
    feature_list = []
    for uid in label_map:
        target_lable.append(tranfiorm_age(int(label_map[uid]['age'])))
        this_feature = []
        # First feature: number of links (followers) for this user.
        this_feature.append(len(links_map.get(uid, [1])))
def gen_test_predict_result_use_PINGLV():
    """Frequency-based ("pinglv") location predictor for test users.

    Training phase: for every candidate location mentioned in a labelled
    user's weibos, count how often the mention matches the user's true
    location ([1], starts at 1.0) vs. not ([0], starts at 0.0).

    Prediction phase: each mention in a test weibo contributes a trust
    value (true / (true + false), or 1.0 for unseen locations) to its
    request-level location; the highest-scoring one wins.

    Side effects: mutates loc_map_twotypes_data, fills
    test_predict_result, and prints per-user debug lines.
    """
    read_wordmap()
    status_map_train = read_status()  # train
    lable_train = read_lable_original_for_loc()
    print "-- construct data for train -- start"
    for uid in lable_train:
        weibo_list = status_map_train[uid]
        loc = lable_train[uid]['loc']
        for one_weibo in weibo_list:
            weibo_content = one_weibo[4]
            occur_loc_list = process_one_weibo_return_loclist_original(
                weibo_content)
            if len(occur_loc_list) != 0:
                for occur_loc in occur_loc_list:
                    # Initialize the per-location counters on first sight.
                    if occur_loc not in loc_map_twotypes_data:
                        loc_map_twotypes_data[occur_loc] = {}
                        loc_map_twotypes_data[occur_loc][0] = 0.0
                        loc_map_twotypes_data[occur_loc][1] = 1.0
                    # print this_vec
                    if occur_loc == loc:
                        loc_map_twotypes_data[occur_loc][1] += 1.0
                    else:
                        loc_map_twotypes_data[occur_loc][0] += 1.0
    print "-- construct data for train -- end"
    print "test data predict -- start"
    test_status = read_status(file_name='test/test_status.txt')
    for uid in test_status:
        weibo_list = test_status[uid]
        true_loc_dic = {}
        oriloc_list = []
        for one_weibo in weibo_list:
            one_weibo_content = one_weibo[4]
            occur_loc_list = process_one_weibo_return_loclist_original(
                one_weibo_content)
            if len(occur_loc_list) != 0:
                for one_loc in occur_loc_list:
                    # Unseen locations get full trust (1.0) by default.
                    trust_val = 1.0
                    if one_loc in loc_map_twotypes_data:
                        trust_val = loc_map_twotypes_data[one_loc][1] / (
                            loc_map_twotypes_data[one_loc][0] +
                            loc_map_twotypes_data[one_loc][1])
                    oriloc_list.append(one_loc + "-" + str(trust_val))
                    # Accumulate trust on the request-level location key.
                    request_loc = tran_loc_to_request(one_loc)
                    true_loc_dic[request_loc] = true_loc_dic.get(
                        request_loc, 0.0) + trust_val
        if len(true_loc_dic) != 0:
            # Pick the request-level location with the highest total trust.
            max_val = -1.0
            max_loc = ""
            for one_loc in true_loc_dic:
                if true_loc_dic[one_loc] > max_val:
                    max_val = true_loc_dic[one_loc]
                    max_loc = one_loc
            test_predict_result[uid] = max_loc
        # Debug output; uses .get(uid, '---') since uid may be unpredicted.
        one_p_line = ""
        for i in true_loc_dic:
            one_p_line += " " + str(i) + ":" + str(true_loc_dic[i])
        print "ori_loc :", ' '.join(oriloc_list)
        print "transform_loc :", one_p_line[1:]
        print "uid : ", uid, ' predict loc: ', test_predict_result.get(
            uid, '---')
    print "test data predict -- end"
    pass
def for_train():
    """Collect per-location positive/negative weibo vectors and train one
    binary classifier per candidate location.

    loc_map_twotypes_data[loc][1] -- vectors of weibos whose author truly
    lives at loc; [0] -- vectors of weibos mentioning loc but authored
    elsewhere. The smaller class is oversampled (select_x_this) up to the
    larger class size, then a LinearSVC is fit and stored in
    loc_map_twotypes_clf[loc].

    Side effects: mutates loc_map_twotypes_data (in-place oversampling and
    list reuse) and loc_map_twotypes_clf; prints progress/size diagnostics.
    """
    status_map_train = read_status()  # train
    lable_train = read_lable_original_for_loc()
    print "-- construct data for train -- start"
    for uid in lable_train:
        weibo_list = status_map_train[uid]
        loc = lable_train[uid]['loc']
        for one_weibo in weibo_list:
            weibo_content = one_weibo[4]
            occur_loc_list = process_one_weibo_return_loclist_original(
                weibo_content)
            if len(occur_loc_list) != 0:
                this_vec = gen_vector_for_one_weibo(weibo_content)
                for occur_loc in occur_loc_list:
                    # Initialize the per-location container on first sight.
                    if occur_loc not in loc_map_twotypes_data:
                        loc_map_twotypes_data[occur_loc] = {}
                        loc_map_twotypes_data[occur_loc][0] = []
                        loc_map_twotypes_data[occur_loc][1] = []
                        loc_map_twotypes_clf[occur_loc] = {}
                    # print this_vec
                    if occur_loc == loc:
                        loc_map_twotypes_data[occur_loc][1].append(this_vec)
                    else:
                        loc_map_twotypes_data[occur_loc][0].append(this_vec)
    print "-- construct data for train -- end"
    print "--print data size -- start"
    for one_loc in loc_map_twotypes_data:
        print 'location :', one_loc, " true size: ", len(
            loc_map_twotypes_data[one_loc][1]), " false size: ", len(
            loc_map_twotypes_data[one_loc][0])
    print "--print data size -- end"
    print "-- train the clf model -- start"
    for index, one_loc in enumerate(loc_map_twotypes_clf.keys()):
        print 'location :', one_loc, " true size: ", len(
            loc_map_twotypes_data[one_loc][1]), " false size: ", len(
            loc_map_twotypes_data[one_loc]
            [0]), " index: ", index, " all: ", len(
            loc_map_twotypes_clf.keys())
        lable_list = []
        data_list = []
        # Oversample the minority class up to the majority class size.
        max_size = max([
            len(loc_map_twotypes_data[one_loc][1]),
            len(loc_map_twotypes_data[one_loc][0])
        ])
        loc_map_twotypes_data[one_loc][1].extend(
            select_x_this(loc_map_twotypes_data[one_loc][1],
                          max_size - len(loc_map_twotypes_data[one_loc][1])))
        loc_map_twotypes_data[one_loc][0].extend(
            select_x_this(loc_map_twotypes_data[one_loc][0],
                          max_size - len(loc_map_twotypes_data[one_loc][0])))
        # Labels: 0.0 for the negative class followed by 1.0 for positive,
        # matching the concatenation order of data_list below.
        lable_list = [
            0.0 for i in range(len(loc_map_twotypes_data[one_loc][0]))
        ]
        lable_list.extend(
            [1.0 for i in range(len(loc_map_twotypes_data[one_loc][1]))])
        data_list = loc_map_twotypes_data[one_loc][0]
        data_list.extend(loc_map_twotypes_data[one_loc][1])
        # Convert sparse dict vectors to dense feature rows.
        X = []
        for one_dic_vec in data_list:
            X.append(train_vecdic_to_vec(one_dic_vec))
        # print data_list
        # print 'location :',one_loc,"one loc train data size: ",len(data_list)," lable size: ",len(lable_list)
        # from sklearn.ensemble import RandomForestClassifier
        # clf = RandomForestClassifier(n_estimators=5, n_jobs=-1)
        # clf = clf.fit(X, lable_list)
        clf = svm.LinearSVC()
        clf.fit(X, lable_list)
        # pre_label = clf.predict(p_X)
        loc_map_twotypes_clf[one_loc] = clf
    print "-- train the clf model -- end"
    pass
# for word in before_79_map_word_count_source.keys(): # before_79_map_word_count_source[word] = float(before_79_map_word_count_source[word]) / count____before_79_map_word_weibo_num # for word in in_80_to_89__map_word_count_source.keys(): # in_80_to_89__map_word_count_source[word] = float(in_80_to_89__map_word_count_source[word]) / count____in_80_to_89__map_word_weibo_num # for word in past_90__map_word_count_source.keys(): # past_90__map_word_count_source[word] = float(past_90__map_word_count_source[word]) / count____past_90__map_word_weibo_num # # for word in before_79_map_word_count_content.keys(): # before_79_map_word_count_content[word] = float(before_79_map_word_count_content[word]) / count____before_79_map_word_weibo_num # for word in in_80_to_89__map_word_count_content.keys(): # in_80_to_89__map_word_count_content[word] = float(in_80_to_89__map_word_count_content[word]) / count____in_80_to_89__map_word_weibo_num # for word in past_90__map_word_count_content.keys(): # past_90__map_word_count_content[word] = float(past_90__map_word_count_content[word]) / count____past_90__map_word_weibo_num # print len(before_79_map_word_count_source) # print len(in_80_to_89__map_word_count_source) # print len(past_90__map_word_count_source) # # print len(before_79_map_word_count_content) # print len(in_80_to_89__map_word_count_content) # print len(past_90__map_word_count_content) return before_79_map_word_count_source, in_80_to_89__map_word_count_source, past_90__map_word_count_source, before_79_map_word_count_content, in_80_to_89__map_word_count_content, past_90__map_word_count_content pass if __name__ == '__main__': # read_train_weibo_status(read_lable(), read_status()) read_train_weibo_status_for_age(read_lable(), read_status()) pass
def gen_model_predict_result(): print "1,construct dataset" status_map_train = read_status() # train status_map_test = read_status('test/test_status.txt') # train train_lable_map = read_lable() uid_list_train = [] age_list_train = [] sex_list_train = [] loc_list_train = [] weibo_list_train = [] uid_list_test = [] weibo_list_test = [] others_feature_train = [] label_map_train = train_lable_map links_map_train = read_links() weibo_map_train = status_map_train for uid in status_map_train: content_concated = " ".join( [one_con_li[4] for one_con_li in status_map_train[uid]]) age = tranfiorm_age(train_lable_map[uid]['age']) sex = train_lable_map[uid]['sex'] loc = train_lable_map[uid]['loc'] uid_list_train.append(uid) age_list_train.append(age) sex_list_train.append(sex) loc_list_train.append(loc) weibo_list_train.append(content_concated) this_feature = [] # 第一个特征,有多少个粉丝 this_feature.append(len(links_map_train.get(uid, [1]))) # 第2,微博数 weibo_num = len(weibo_map_train.get(uid, [1])) this_feature.append(weibo_num) # 微博重复数 unique_source = {} unique_weibo = {} source_num = {} for one_weibo in weibo_map_train[uid]: source = one_weibo[2] content = one_weibo[4] if source not in unique_source: unique_source[source] = 1 if content not in unique_weibo: unique_weibo[content] = 1 source_num[source] = source_num.get(source, 0.0) + 1.0 this_feature.append(weibo_num / float(len(unique_source))) this_feature.append(weibo_num / float(len(unique_weibo))) this_list_for_source = [] for one_source in unique_source_list: this_list_for_source.append(source_num[one_source] if one_source in source_num else 0.0) this_feature.extend(this_list_for_source) others_feature_train.append(this_feature) links_map_test = read_links(file_name='test/test_links.txt') weibo_map_test = read_status(file_name='test/test_status.txt') others_feature_test = [] for uid in status_map_test: content_concated = " ".join( [one_con_li[4] for one_con_li in status_map_test[uid]]) uid_list_test.append(uid) 
weibo_list_test.append(content_concated) this_feature = [] # 第一个特征,有多少个粉丝 this_feature.append(len(links_map_test.get(uid, [1]))) # 第2,微博数 weibo_num = len(weibo_map_test.get(uid, [1])) this_feature.append(weibo_num) # 微博重复数 unique_source = {} unique_weibo = {} source_num = {} for one_weibo in weibo_map_test[uid]: source = one_weibo[2] content = one_weibo[4] if source not in unique_source: unique_source[source] = 1 if content not in unique_weibo: unique_weibo[content] = 1 source_num[source] = source_num.get(source, 0.0) + 1.0 this_feature.append(weibo_num / float(len(unique_source))) this_feature.append(weibo_num / float(len(unique_weibo))) this_list_for_source = [] for one_source in unique_source_list: this_list_for_source.append(source_num[one_source] if one_source in source_num else 0.0) this_feature.extend(this_list_for_source) others_feature_test.append(this_feature) # store_to_file(weibo_list_train, weibo_list_test) # one_hot all_features_list = gen_feature_list(weibo_list_train, weibo_list_test) # 添加原始feature print "添加原始feature start" for index, da in enumerate(others_feature_train): all_features_list[index].extend(da) for index, da in enumerate(others_feature_test): all_features_list[index + len(others_feature_train)].extend(da) X = all_features_list[:len(weibo_list_train)] p_X = all_features_list[len(weibo_list_train):] print "添加原始feature end" print "2,one hot and lda done, start training and predict" return train_and_predict(X, age_list_train, p_X, uid_list_test), train_and_predict( X, sex_list_train, p_X, uid_list_test), train_and_predict( X, loc_list_train, p_X, uid_list_test)