Пример #1
0
def gen_inverse_documen_fre():
    print "calc inverse document frequency---start"

    weibo_list_train = []
    weibo_list_test = []

    status_map_train = read_status()  # train
    for uid in status_map_train:
        content_concated = " ".join(
            [one_con_li[4] for one_con_li in status_map_train[uid]])
        weibo_list_train.append(content_concated)

    weibo_map_test = read_status(file_name='test/test_status.txt')
    for uid in weibo_map_test:
        content_concated = " ".join(
            [one_con_li[4] for one_con_li in weibo_map_test[uid]])
        weibo_list_test.append(content_concated)

    all_weibo = []
    all_weibo.extend(weibo_list_train)
    all_weibo.extend(weibo_list_test)
    N = len(weibo_list_test) + len(weibo_list_train)

    global inverse_doc_fre
    for one_weibo in all_weibo:
        for one_word in set(one_weibo.split(' ')):
            inverse_doc_fre[one_word] = 1.0 + inverse_doc_fre.get(
                one_word, 0.0)
    for one_word in inverse_doc_fre:
        inverse_doc_fre[one_word] = math.log(
            float(N) / inverse_doc_fre[one_word])
    print "calc inverse document frequency---end"
Пример #2
0
def gen_test_predict_result():
    print "test data predict -- start"
    test_status = read_status(file_name='test/test_status.txt')
    for uid in test_status:
        weibo_list = test_status[uid]
        true_loc_list = []

        for one_weibo in weibo_list:
            one_weibo_content = one_weibo[4]
            occur_loc_list = process_one_weibo_return_loclist_original(
                one_weibo_content)
            if len(occur_loc_list) != 0:
                for one_loc in occur_loc_list:
                    one_weibo_vec = train_vecdic_to_vec(
                        gen_vector_for_one_weibo(one_weibo_content))

                    if predit_oneloc_with_oneweibo(one_loc, one_weibo_vec):
                        true_loc_list.append(tran_loc_to_request(one_loc))

        if len(true_loc_list) != 0:
            test_predict_result[uid] = pick_location_this(true_loc_list)

        print "uid : ", uid, ' predict loc: ', test_predict_result[uid]
    print "test data predict -- end"
    pass
Пример #3
0
def train_and_predict_age():
    """Train a random-forest age classifier and predict for all test users.

    Returns a dict mapping test uid -> predicted age label.
    """
    lables, features = feature_construct_for_train(
        read_lable(), read_links(), read_status())

    test_features = feature_construct_for_teat(
        read_links(file_name='test/test_links.txt'),
        read_status(file_name='test/test_status.txt'))

    from sklearn.ensemble import RandomForestClassifier
    model = RandomForestClassifier(n_estimators=10).fit(features, lables)

    # one prediction per test user; predict returns an array, take item 0
    return dict((uid, model.predict(test_features[uid])[0])
                for uid in test_features)
Пример #4
0
def gen_test_predict_result():
    """Score candidate locations for each test user and store the best
    request-normalised location in test_predict_result.

    NOTE(review): experimental scoring variant; several alternative
    schemes are left commented out below.
    """
    print "test data predict -- start"
    test_status = read_status(file_name='test/test_status.txt')
    for uid in test_status:
        weibo_list = test_status[uid]
        
        # raw location string -> score accumulated over this user's weibos
        true_loc_dic = {}
        
        for one_weibo in weibo_list:
            one_weibo_content = one_weibo[4]
            occur_loc_list = process_one_weibo_return_loclist_original(one_weibo_content)
            
            if len(occur_loc_list) != 0:
                # one normalised vector per weibo, reused for every
                # candidate location the weibo mentions
                one_weibo_vec = normalize(np.array(train_vecdic_to_vec(gen_vector_for_one_weibo(one_weibo_content))))
                for one_loc in occur_loc_list:
                    # NOTE(review): this unconditional reset to 0.5 discards
                    # any score accumulated for one_loc from earlier weibos
                    # of the same user -- confirm that is intended.
                    true_loc_dic[one_loc] = 0.5
                    if one_loc in loc_map_twotypes_data:
#                         true_loc_dic[one_loc] = max(true_loc_dic.get(one_loc, 0.0) , ((one_weibo_vec * loc_map_twotypes_data[one_loc][1]).sum() + 1.0) / ((one_weibo_vec * loc_map_twotypes_data[one_loc][1]).sum() + 1.0 + (one_weibo_vec * loc_map_twotypes_data[one_loc][0]).sum()))
                        # similarity to the positive vector minus similarity
                        # to the negative vector (elementwise product sums)
                        true_loc_dic[one_loc] = true_loc_dic.get(one_loc, 0.0) + (one_weibo_vec * loc_map_twotypes_data[one_loc][1]).sum() - (one_weibo_vec * loc_map_twotypes_data[one_loc][0]).sum()
#                         print true_loc_dic[one_loc],(one_weibo_vec * loc_map_twotypes_data[one_loc][1]).sum(),(one_weibo_vec * loc_map_twotypes_data[one_loc][0]).sum()
                    else:
                        # location unseen during training: fixed score
                        true_loc_dic[one_loc] =  0.8
#         if len(true_loc_list) != 0:
#             test_predict_result[uid] = pick_location_this(true_loc_list)
#         true_loc_for_pick = {}
#         for one_loc in true_loc_dic:
#             true_loc_for_pick[tran_loc_to_request(one_loc)] = true_loc_for_pick.get(tran_loc_to_request(one_loc),0.0) + true_loc_dic[one_loc] 
        
        # fold raw locations into their request-normalised form, summing scores
        requested_true_loc = {}
        for loc in true_loc_dic:
            if tran_loc_to_request(loc) not in requested_true_loc:
                requested_true_loc[tran_loc_to_request(loc)] = 0.0
            requested_true_loc[tran_loc_to_request(loc)] += true_loc_dic[loc]
        
        if len(requested_true_loc) !=0:
            # argmax over the accumulated scores
            max_val = -1.0;
            max_loc  = ""
            for one_loc in requested_true_loc:
                if requested_true_loc[one_loc] > max_val:
                    max_val = requested_true_loc[one_loc]
                    max_loc = one_loc
            
            test_predict_result[uid] = max_loc
        
#             print "uid : ", uid, ' predict loc: ', test_predict_result[uid]
    print "test data predict -- end"
    pass
Пример #5
0
def for_train():
    status_map_train = read_status()  # train
    lable_train = read_lable_original_for_loc()
    
    print "-- construct data for train -- start"
    for uid in lable_train:
        weibo_list = status_map_train[uid]
        loc = lable_train[uid]['loc']
        
        for one_weibo in weibo_list:
            weibo_content = one_weibo[4]
            
            occur_loc_list = process_one_weibo_return_loclist_original(weibo_content)
            if len(occur_loc_list) != 0:
                this_vec = gen_vector_for_one_weibo(weibo_content)
                
                for occur_loc in occur_loc_list:
                    # 初始化 大容器
                    if occur_loc not in loc_map_twotypes_data:
                        loc_map_twotypes_data[occur_loc] = {}
                        loc_map_twotypes_data[occur_loc][0] = np.array([0.0 for i in range(len(dictionry))])
                        loc_map_twotypes_data[occur_loc][10] = 1.0
                        loc_map_twotypes_data[occur_loc][1] = np.array([0.0 for i in range(len(dictionry))])
                        loc_map_twotypes_data[occur_loc][11] = 1.0
                        
                    
                    for i in this_vec:
                        if occur_loc == loc:
                            loc_map_twotypes_data[occur_loc][1][i] += this_vec[i]
                            loc_map_twotypes_data[occur_loc][11] += 1.0
                        else:
                            loc_map_twotypes_data[occur_loc][0][i] += this_vec[i]
                            loc_map_twotypes_data[occur_loc][10] += 1.0        
    
    for one_loc in loc_map_twotypes_data:
        loc_map_twotypes_data[one_loc][0] = loc_map_twotypes_data[one_loc][0]/loc_map_twotypes_data[occur_loc][10]
        loc_map_twotypes_data[one_loc][1] = loc_map_twotypes_data[one_loc][1]/loc_map_twotypes_data[occur_loc][11]
        print loc_map_twotypes_data[one_loc][0].sum()
    
    print "-- construct data for train -- end"    
Пример #6
0
from tutils_about_weibo import tranfiorm_age
from utils_about_weibo_3 import read_lable, read_links, read_status


def get_unique_source(weibo_map):
    """Return the distinct weibo source (client) strings across all users.

    weibo_map maps uid -> list of weibo records; field 2 of each record is
    the source string.
    """
    seen = {}
    for uid in weibo_map:
        for record in weibo_map[uid]:
            seen.setdefault(record[2], 1)
    return seen.keys()


# Module-level: vocabulary of weibo source (client) strings observed in the
# training split; used as the fixed axis for per-source count features.
unique_source_list = get_unique_source(read_status())


def feature_construct_for_train(label_map, links_map, weibo_map):
    """Build parallel (target_lable, feature_list) for age-model training.

    label_map: uid -> {'age': ...}; links_map: uid -> follower list;
    weibo_map: uid -> weibo records (not used in the visible part).

    NOTE(review): this definition appears truncated in this excerpt --
    only the first feature (follower count) is visible and no return
    statement is shown.
    """

    target_lable = []
    feature_list = []

    for uid in label_map:
        target_lable.append(tranfiorm_age(int(label_map[uid]['age'])))

        this_feature = []

        # first feature: number of followers
        this_feature.append(len(links_map.get(uid, [1])))
Пример #7
0
def gen_test_predict_result_use_PINGLV():
    read_wordmap()
    status_map_train = read_status()  # train
    lable_train = read_lable_original_for_loc()

    print "-- construct data for train -- start"
    for uid in lable_train:
        weibo_list = status_map_train[uid]
        loc = lable_train[uid]['loc']

        for one_weibo in weibo_list:
            weibo_content = one_weibo[4]

            occur_loc_list = process_one_weibo_return_loclist_original(
                weibo_content)
            if len(occur_loc_list) != 0:

                for occur_loc in occur_loc_list:
                    # 初始化 大容器
                    if occur_loc not in loc_map_twotypes_data:
                        loc_map_twotypes_data[occur_loc] = {}
                        loc_map_twotypes_data[occur_loc][0] = 0.0
                        loc_map_twotypes_data[occur_loc][1] = 1.0


#                     print this_vec
                    if occur_loc == loc:
                        loc_map_twotypes_data[occur_loc][1] += 1.0
                    else:
                        loc_map_twotypes_data[occur_loc][0] += 1.0
    print "-- construct data for train -- end"

    print "test data predict -- start"
    test_status = read_status(file_name='test/test_status.txt')
    for uid in test_status:
        weibo_list = test_status[uid]
        true_loc_dic = {}
        oriloc_list = []

        for one_weibo in weibo_list:
            one_weibo_content = one_weibo[4]
            occur_loc_list = process_one_weibo_return_loclist_original(
                one_weibo_content)

            if len(occur_loc_list) != 0:
                for one_loc in occur_loc_list:
                    trust_val = 1.0
                    if one_loc in loc_map_twotypes_data:
                        trust_val = loc_map_twotypes_data[one_loc][1] / (
                            loc_map_twotypes_data[one_loc][0] +
                            loc_map_twotypes_data[one_loc][1])
                    oriloc_list.append(one_loc + "-" + str(trust_val))
                    request_loc = tran_loc_to_request(one_loc)
                    true_loc_dic[request_loc] = true_loc_dic.get(
                        request_loc, 0.0) + trust_val

        if len(true_loc_dic) != 0:

            max_val = -1.0
            max_loc = ""
            for one_loc in true_loc_dic:
                if true_loc_dic[one_loc] > max_val:
                    max_val = true_loc_dic[one_loc]
                    max_loc = one_loc

            test_predict_result[uid] = max_loc

        one_p_line = ""
        for i in true_loc_dic:
            one_p_line += " " + str(i) + ":" + str(true_loc_dic[i])

        print "ori_loc :", ' '.join(oriloc_list)
        print "transform_loc :", one_p_line[1:]
        print "uid : ", uid, ' predict loc: ', test_predict_result.get(
            uid, '---')
    print "test data predict -- end"

    pass
Пример #8
0
def for_train():
    """Train one binary classifier (LinearSVC) per candidate location.

    Pass 1: for every training user, each weibo mentioning candidate
    locations contributes its vector as a positive example (key 1) of a
    mentioned location when that location equals the user's label, else as
    a negative example (key 0).
    Pass 2: per location, the smaller class is padded up to the larger
    class's size via select_x_this, then an SVM is fitted and stored in
    loc_map_twotypes_clf.
    """
    status_map_train = read_status()  # train
    lable_train = read_lable_original_for_loc()

    print "-- construct data for train -- start"
    for uid in lable_train:
        weibo_list = status_map_train[uid]
        loc = lable_train[uid]['loc']

        for one_weibo in weibo_list:
            weibo_content = one_weibo[4]

            occur_loc_list = process_one_weibo_return_loclist_original(
                weibo_content)
            if len(occur_loc_list) != 0:
                this_vec = gen_vector_for_one_weibo(weibo_content)

                for occur_loc in occur_loc_list:
                    # initialise the per-location containers on first sight
                    if occur_loc not in loc_map_twotypes_data:
                        loc_map_twotypes_data[occur_loc] = {}
                        loc_map_twotypes_data[occur_loc][0] = []
                        loc_map_twotypes_data[occur_loc][1] = []

                        loc_map_twotypes_clf[occur_loc] = {}

#                     print this_vec
                    if occur_loc == loc:
                        loc_map_twotypes_data[occur_loc][1].append(this_vec)
                    else:
                        loc_map_twotypes_data[occur_loc][0].append(this_vec)
    print "-- construct data for train -- end"

    print "--print data size -- start"
    for one_loc in loc_map_twotypes_data:
        print 'location :', one_loc, " true size: ", len(
            loc_map_twotypes_data[one_loc][1]), " false size: ", len(
                loc_map_twotypes_data[one_loc][0])
    print "--print data size -- end"

    print "-- train the clf model -- start"
    for index, one_loc in enumerate(loc_map_twotypes_clf.keys()):

        print 'location :', one_loc, " true size: ", len(
            loc_map_twotypes_data[one_loc][1]), " false size: ", len(
                loc_map_twotypes_data[one_loc]
                [0]), " index: ", index, " all: ", len(
                    loc_map_twotypes_clf.keys())

        lable_list = []
        data_list = []
        # pad the smaller class with resampled examples so both classes
        # reach max_size (class balancing, mutates the stored lists)
        max_size = max([
            len(loc_map_twotypes_data[one_loc][1]),
            len(loc_map_twotypes_data[one_loc][0])
        ])
        loc_map_twotypes_data[one_loc][1].extend(
            select_x_this(loc_map_twotypes_data[one_loc][1],
                          max_size - len(loc_map_twotypes_data[one_loc][1])))
        loc_map_twotypes_data[one_loc][0].extend(
            select_x_this(loc_map_twotypes_data[one_loc][0],
                          max_size - len(loc_map_twotypes_data[one_loc][0])))

        # labels: 0.0 for every negative, then 1.0 for every positive,
        # matching the concatenation order of data_list below
        lable_list = [
            0.0 for i in range(len(loc_map_twotypes_data[one_loc][0]))
        ]
        lable_list.extend(
            [1.0 for i in range(len(loc_map_twotypes_data[one_loc][1]))])

        # NOTE(review): data_list aliases loc_map_twotypes_data[one_loc][0];
        # the extend below also appends the positives into that stored list
        # -- confirm nothing reads the container after training.
        data_list = loc_map_twotypes_data[one_loc][0]
        data_list.extend(loc_map_twotypes_data[one_loc][1])

        # convert dict vectors to dense feature rows for the classifier
        X = []
        for one_dic_vec in data_list:
            X.append(train_vecdic_to_vec(one_dic_vec))

#         print data_list
#         print 'location :',one_loc,"one loc train data size: ",len(data_list)," lable size: ",len(lable_list)
#         from sklearn.ensemble import RandomForestClassifier
#         clf = RandomForestClassifier(n_estimators=5, n_jobs=-1)
#         clf = clf.fit(X, lable_list)

        clf = svm.LinearSVC()
        clf.fit(X, lable_list)
        #         pre_label = clf.predict(p_X)

        loc_map_twotypes_clf[one_loc] = clf
    print "-- train the clf model -- end"

    pass
Пример #9
0
#     for word in before_79_map_word_count_source.keys():
#         before_79_map_word_count_source[word] = float(before_79_map_word_count_source[word]) / count____before_79_map_word_weibo_num
#     for word in in_80_to_89__map_word_count_source.keys():
#         in_80_to_89__map_word_count_source[word] = float(in_80_to_89__map_word_count_source[word]) / count____in_80_to_89__map_word_weibo_num
#     for word in past_90__map_word_count_source.keys():
#         past_90__map_word_count_source[word] = float(past_90__map_word_count_source[word]) / count____past_90__map_word_weibo_num
#
#     for word in before_79_map_word_count_content.keys():
#         before_79_map_word_count_content[word] = float(before_79_map_word_count_content[word]) / count____before_79_map_word_weibo_num
#     for word in in_80_to_89__map_word_count_content.keys():
#         in_80_to_89__map_word_count_content[word] = float(in_80_to_89__map_word_count_content[word]) / count____in_80_to_89__map_word_weibo_num
#     for word in past_90__map_word_count_content.keys():
#         past_90__map_word_count_content[word] = float(past_90__map_word_count_content[word]) / count____past_90__map_word_weibo_num

#     print len(before_79_map_word_count_source)
#     print len(in_80_to_89__map_word_count_source)
#     print len(past_90__map_word_count_source)
#
#     print len(before_79_map_word_count_content)
#     print len(in_80_to_89__map_word_count_content)
#     print len(past_90__map_word_count_content)

    return before_79_map_word_count_source, in_80_to_89__map_word_count_source, past_90__map_word_count_source, before_79_map_word_count_content, in_80_to_89__map_word_count_content, past_90__map_word_count_content
    pass

if __name__ == '__main__':
    #     read_train_weibo_status(read_lable(), read_status())
    # Entry point: build the per-age-group word statistics from the
    # training labels and statuses.
    read_train_weibo_status_for_age(read_lable(), read_status())
    pass
Пример #10
0
def gen_model_predict_result():
    print "1,construct dataset"
    status_map_train = read_status()  # train
    status_map_test = read_status('test/test_status.txt')  # train

    train_lable_map = read_lable()

    uid_list_train = []
    age_list_train = []
    sex_list_train = []
    loc_list_train = []
    weibo_list_train = []

    uid_list_test = []
    weibo_list_test = []

    others_feature_train = []
    label_map_train = train_lable_map
    links_map_train = read_links()
    weibo_map_train = status_map_train

    for uid in status_map_train:

        content_concated = " ".join(
            [one_con_li[4] for one_con_li in status_map_train[uid]])
        age = tranfiorm_age(train_lable_map[uid]['age'])
        sex = train_lable_map[uid]['sex']
        loc = train_lable_map[uid]['loc']

        uid_list_train.append(uid)
        age_list_train.append(age)
        sex_list_train.append(sex)
        loc_list_train.append(loc)
        weibo_list_train.append(content_concated)

        this_feature = []

        # 第一个特征,有多少个粉丝
        this_feature.append(len(links_map_train.get(uid, [1])))

        # 第2,微博数
        weibo_num = len(weibo_map_train.get(uid, [1]))
        this_feature.append(weibo_num)

        # 微博重复数
        unique_source = {}
        unique_weibo = {}

        source_num = {}
        for one_weibo in weibo_map_train[uid]:
            source = one_weibo[2]
            content = one_weibo[4]
            if source not in unique_source:
                unique_source[source] = 1
            if content not in unique_weibo:
                unique_weibo[content] = 1

            source_num[source] = source_num.get(source, 0.0) + 1.0

        this_feature.append(weibo_num / float(len(unique_source)))
        this_feature.append(weibo_num / float(len(unique_weibo)))

        this_list_for_source = []
        for one_source in unique_source_list:
            this_list_for_source.append(source_num[one_source] if one_source in
                                        source_num else 0.0)
        this_feature.extend(this_list_for_source)

        others_feature_train.append(this_feature)

    links_map_test = read_links(file_name='test/test_links.txt')
    weibo_map_test = read_status(file_name='test/test_status.txt')
    others_feature_test = []
    for uid in status_map_test:

        content_concated = " ".join(
            [one_con_li[4] for one_con_li in status_map_test[uid]])

        uid_list_test.append(uid)
        weibo_list_test.append(content_concated)

        this_feature = []

        # 第一个特征,有多少个粉丝
        this_feature.append(len(links_map_test.get(uid, [1])))

        # 第2,微博数
        weibo_num = len(weibo_map_test.get(uid, [1]))
        this_feature.append(weibo_num)

        # 微博重复数
        unique_source = {}
        unique_weibo = {}

        source_num = {}
        for one_weibo in weibo_map_test[uid]:
            source = one_weibo[2]
            content = one_weibo[4]
            if source not in unique_source:
                unique_source[source] = 1
            if content not in unique_weibo:
                unique_weibo[content] = 1

            source_num[source] = source_num.get(source, 0.0) + 1.0

        this_feature.append(weibo_num / float(len(unique_source)))
        this_feature.append(weibo_num / float(len(unique_weibo)))

        this_list_for_source = []
        for one_source in unique_source_list:
            this_list_for_source.append(source_num[one_source] if one_source in
                                        source_num else 0.0)

        this_feature.extend(this_list_for_source)
        others_feature_test.append(this_feature)

#     store_to_file(weibo_list_train, weibo_list_test)
# one_hot
    all_features_list = gen_feature_list(weibo_list_train, weibo_list_test)

    # 添加原始feature
    print "添加原始feature start"
    for index, da in enumerate(others_feature_train):
        all_features_list[index].extend(da)
    for index, da in enumerate(others_feature_test):
        all_features_list[index + len(others_feature_train)].extend(da)

    X = all_features_list[:len(weibo_list_train)]
    p_X = all_features_list[len(weibo_list_train):]
    print "添加原始feature end"

    print "2,one hot and lda done, start training and predict"
    return train_and_predict(X, age_list_train, p_X,
                             uid_list_test), train_and_predict(
                                 X, sex_list_train, p_X,
                                 uid_list_test), train_and_predict(
                                     X, loc_list_train, p_X, uid_list_test)