Пример #1
0
def extract_features(eid, tree, label):
    """Compute per-level average node statistics for depths 1 through 10.

    Every node yielded by ``tree.all_nodes_itr()`` whose ``tree.depth(node=...)``
    is between 1 and 10 (inclusive) contributes the numeric attributes of its
    ``node.data`` to the bucket of its depth. Nodes at depth 0 and nodes
    deeper than 10 are ignored.

    Args:
        eid: Not used by this function.
        tree: Object exposing ``all_nodes_itr()`` and ``depth(node=...)``.
        label: Not used by this function.

    Returns:
        A dict keyed by depth (1..10). Each value is a dict where ``'count'``
        is the number of nodes at that depth and every other key is the sum
        of the matching attribute divided by ``'count'``, rounded to two
        decimals. Depths without nodes keep all values at the integer 0.
    """
    # Keys that are turned into per-node averages once accumulation is done.
    averaged_keys = (
        'rep_count',
        'comments_count',
        'bi_followers_count',
        'friends_count',
        'verified_count',
        'followers_count',
        'statuses_count',
        'male_count',
        'female_count',
        'favourites_count',
    )
    all_keys = ('count',) + averaged_keys

    # One zero-initialised bucket per depth.
    features = {depth: dict.fromkeys(all_keys, 0) for depth in range(1, 11)}

    # Accumulate raw sums for every node located at depth 1..10.
    for node in tree.all_nodes_itr():
        depth = tree.depth(node=node)
        if not (depth <= 10 and depth > 0):
            continue

        bucket = features[depth]
        bucket['count'] += 1

        # Social-network attributes of the post.
        bucket['rep_count'] += node.data.reposts_count
        bucket['comments_count'] += node.data.comments_count

        # Attributes of the posting user.
        bucket['bi_followers_count'] += node.data.bi_followers_count
        bucket['friends_count'] += node.data.friends_count
        # Equality with True (not plain truthiness) is kept on purpose so
        # that exactly the same values are counted as before.
        if node.data.verified == True:  # noqa: E712
            bucket['verified_count'] += node.data.verified
        bucket['followers_count'] += node.data.followers_count
        bucket['statuses_count'] += node.data.statuses_count

        # 'm' is male, 'f' is female; any other value is counted as neither.
        gender = node.data.gender
        if gender == 'm':
            bucket['male_count'] += 1
        elif gender == 'f':
            bucket['female_count'] += 1

        bucket['favourites_count'] += node.data.favourites_count

    # Replace the sums by per-node averages for every populated depth.
    for bucket in features.values():
        node_total = bucket['count']
        if node_total == 0:
            continue
        for key in averaged_keys:
            bucket[key] = round(bucket[key] / node_total, 2)

    return features
Пример #2
0
def cal_node_level_count(
        type,
        data_path='D:/chenjiao/SinaWeibo/datasets2/Weibo.txt',
        json_dir='D:/chenjiao/SinaWeibo/datasets2/Weibo'):
    """Print how many tree nodes sit at each depth, in total and per event.

    The index file at ``data_path`` is read as a headerless tab-separated
    table whose column 0 holds ``eid:<id>`` and column 1 holds
    ``label:<0|1>``. For every selected row the file ``<json_dir>/<id>.json``
    is loaded; it must be a non-empty list of dicts with ``"mid"`` and
    ``"parent"`` entries, the first element being the root. A tree is built
    from these entries and the number of nodes per depth is accumulated over
    all events.

    Two things are printed after a ``----- <eid>`` progress line per event:
    the dict of total node counts per depth (ascending depth), then one line
    per depth with the total divided by the number of selected events.

    Args:
        type: ``'fake'`` keeps rows labelled ``label:1``, ``'real'`` keeps
            rows labelled ``label:0``; any other value keeps every row.
            (The name shadows the builtin but is kept for compatibility
            with existing callers.)
        data_path: Path of the tab-separated index file.
        json_dir: Directory containing one ``<eid>.json`` file per event.

    Returns:
        None. Results are only printed.
    """
    data = pd.read_csv(data_path, sep='\t', header=None)
    if type == 'fake':
        data = data.loc[data[1] == 'label:1']
    elif type == 'real':
        data = data.loc[data[1] == 'label:0']

    # DataFrame.as_matrix() was removed in pandas 1.0; .values is available
    # in both old and current pandas releases.
    data_array = data.values
    event_count = data_array.shape[0]

    # depth -> number of nodes at that depth, summed over all events
    level_totals = {}
    for row in data_array:
        eid = str(row[0]).replace('eid:', '')
        # The context manager closes the file even if parsing fails.
        with open('{}/{}.json'.format(json_dir, eid), 'r',
                  encoding='utf-8') as load_f:
            json_data = json.load(load_f)
        print('-----', eid)

        tree = Tree()
        root_mid = json_data[0].get("mid")
        tree.create_node(root_mid, root_mid)

        for post in json_data[1:]:
            try:
                tree.create_node(post.get("mid"),
                                 post.get("mid"),
                                 parent=post.get("parent"))
            except Exception:
                # Best effort: entries that cannot be attached (for example
                # an unknown parent or a repeated id) are skipped. Unlike a
                # bare ``except``, this lets KeyboardInterrupt/SystemExit
                # propagate.
                continue

        # Count the nodes of this tree per depth.
        for node in tree.all_nodes_itr():
            level = tree.depth(node=node)
            level_totals[level] = level_totals.get(level, 0) + 1

    # Report in ascending depth order.
    tree_levels_count_list = dict(sorted(level_totals.items()))
    print(tree_levels_count_list)

    # When no event was selected the dict is empty, so no division by zero
    # can happen here.
    for level, total in tree_levels_count_list.items():
        print(level, total / event_count)