def main(group_id):
    #topiclist_path = 'data-dynamic/TopicList-' + group_id + '-shuffled.txt' # for the Douban dataset
    topiclist_path = 'data-dynamic/' + group_id + '-post-list.txt' # for the Tianya dataset
    topic_list = load_id_list(topiclist_path)
    print 'Number of total topics loaded: ', len(topic_list)

    pop_level = [25, 50, float('inf')]

    # prediction_date: start predicting this long after a post is published
    # target_date: predict the comment count at this point in time
    # Both parameters are tunable.
    # sampling interval
    gaptime = timedelta(hours=3)
    prediction_date = timedelta(hours=10*3)
    response_time = timedelta(hours=24)
    target_date = prediction_date + response_time
    # number of intervals each topic has before prediction_date
    num_feature = int(prediction_date.total_seconds() / gaptime.total_seconds())
    print 'Number of features: ', num_feature

    alpha = 1.5
    percentage_threshold = 0.7

    print 'Generating training and test dataset...'
    dataset, comment_count_dataset, Bao_dataset, category_count_list = prepare_dataset(group_id, \
        topic_list, gaptime, pop_level, prediction_date, target_date, alpha, percentage_threshold)

    # save the topic ids that survived filtering
    #save_filtered_topics(group_id, dataset)

    print 'Plotting factor propagation'
    #factor_propagation_plot(dataset, num_feature)
    #topic_propagation_plot(dataset, num_feature)
    #return

    # Shuffle the posts. Disabled while debugging so that runs are reproducible;
    # note that with shuffling every run uses a different split.
    #shuffle(dataset)

    total = len(dataset)
    train_cnt = total * 4 / 5
    train_set = dataset[:train_cnt]
    test_set = dataset[train_cnt:]
    print 'Training: %d, Test: %d' % (train_cnt, total-train_cnt)
    print 'Category 0: %d, Category 1: %d ' % (category_count_list[0], category_count_list[1])
    print 'Imbalance ratio: ', category_count_list[0] * 1.0 / category_count_list[1]

    dataset, comment_count_dataset, Bao_dataset, category_count_list = down_sampling_dataset(dataset, \
        comment_count_dataset, Bao_dataset, category_count_list)

    print 'After down sampling...'
    total = len(dataset)
    train_cnt = total * 4 / 5
    train_set = dataset[:train_cnt]
    test_set = dataset[train_cnt:]
    print 'Training: %d, Test: %d' % (train_cnt, total-train_cnt)
    print 'Category 0: %d, Category 1: %d ' % (category_count_list[0], category_count_list[1])
    print 'Imbalance ratio: ', category_count_list[0] * 1.0 / category_count_list[1]
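# All of these scripts share a load_id_list helper that is not shown in this
# section. A minimal sketch, assuming the list file holds one id per line:
def load_id_list(path):
    """Read a list of topic/post ids, one per line, skipping blank lines."""
    id_list = []
    with open(path, 'r') as f:
        for line in f:
            line = line.strip()
            if line != '':
                id_list.append(line)
    return id_list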
def main(group_id):
    import random
    # read the raw topic list, shuffle it, and persist the shuffled order
    topic_list = load_id_list('data-dynamic/TopicList-' + group_id + '.txt')
    random.shuffle(topic_list)
    topiclist_path = 'data-dynamic/TopicList-' + group_id + '-shuffled.txt'
    f = open(topiclist_path, 'w')
    for topic_id in topic_list:
        f.write(topic_id + '\n')
    f.close()
def main(group_id):
    topic_list = load_id_list('data-dynamic/TopicList-' + group_id + '.txt')
    print 'Topics id loaded:', len(topic_list)
    base_path = 'data-dynamic/' + group_id + '/'
    x = [0] * len(topic_list)
    index = 0
    for topic_id in topic_list:
        path = base_path + topic_id + '.txt'
        if not os.path.exists(path):
            continue
        print 'Processing file: ', path
        f = open(path, 'r')
        #print 'Reading file: ', path
        line = f.readline().strip()
        seg_list = line.split('[=]')
        num_comment = int(seg_list[3])
        f.close()
        # drop threads whose total comment count falls below the threshold
        if num_comment < MIN_COMMENT:
            continue
        x[index] = num_comment
        index += 1
    x = x[:index]
    xmax = max(x)
    xmin = min(x)
    print 'Number of threads:', len(x)
    print 'Max number of comments:', xmax
    print 'Min number of comments:', xmin

    # cumulative, normalized histogram of the comment counts
    (n, bins, patches) = plt.hist(x, bins=range(xmin, xmax, 1), cumulative=True, normed=True)
    #(n, bins, patches) = plt.hist(x, bins=5)
    #plt.show()

    num_level = 2
    popularity_level = get_popularity_level(num_level, bins, n)
    print 'Popularity level: '
    print popularity_level
    # report how many threads fall into each level
    level_statics(popularity_level, x)
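# get_popularity_level and level_statics are defined elsewhere. A minimal
# sketch, assuming levels are cut where the cumulative histogram crosses equal
# probability mass (the median for num_level = 2), with inf closing the last level:
def get_popularity_level(num_level, bins, n):
    """Pick num_level-1 thresholds from a cumulative, normalized histogram."""
    level = []
    target = 1.0 / num_level
    for edge, cum in zip(bins[1:], n):
        if cum >= target * (len(level) + 1):
            level.append(int(edge))
            if len(level) == num_level - 1:
                break
    level.append(float('inf'))
    return level

def level_statics(popularity_level, x):
    """Count how many threads fall into each popularity level."""
    count = [0] * len(popularity_level)
    for value in x:
        for i, upper in enumerate(popularity_level):
            if value < upper:
                count[i] += 1
                break
    print 'Threads per level:', count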
def main(group_id, factor_index):
    topic_list_path = 'data-dynamic/TopicList-' + group_id + '.txt'
    topic_list = load_id_list(topic_list_path)
    print 'Num of topics loaded:', len(topic_list)

    popularity = [0] * len(topic_list)
    factor = [0] * len(topic_list)
    index = 0
    base_path = 'data-dynamic/' + group_id + '/'
    for topic_id in topic_list:
        path = base_path + topic_id + '.txt'
        if not os.path.exists(path):
            continue
        f = open(path, 'r')
        try:
            # get the thread publish date
            line = f.readline().strip()
            if line == '':
                continue
            seg_list = line.split('[=]')
            thread_pubdate = datetime.strptime(seg_list[2], '%Y-%m-%d %H:%M:%S')
            curr_comment_cnt = 0
            for line in f:
                line = line.strip()
                seg_list = line.split('[=]')
                pubdate = datetime.strptime(seg_list[2], '%Y-%m-%d %H:%M:%S')
                curr_comment_cnt += 1
                if pubdate < thread_pubdate + AFTER_PUBLISHING_TIME:
                    continue
                # first comment past the observation window: record the pair
                factor_value = float(seg_list[factor_index])
                popularity[index] = curr_comment_cnt
                factor[index] = factor_value
                index += 1
                break
        except Exception as e:
            print 'Exception occurred:', e
            print 'Errors in topic:', topic_id
        finally:
            f.close()

    print 'Number of pairs:', index
    popularity = popularity[:index]
    factor = factor[:index]
    factor_relevance_plot(popularity, factor)
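# factor_relevance_plot is defined elsewhere. A minimal sketch, assuming it
# scatters one dynamic factor against early popularity to eyeball the correlation:
import matplotlib.pyplot as plt

def factor_relevance_plot(popularity, factor):
    """Scatter plot of a dynamic factor vs. early comment count."""
    plt.scatter(factor, popularity, s=10, alpha=0.5)
    plt.xlabel('Factor value')
    plt.ylabel('Number of comments')
    plt.show()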
def main(group_id):
    latest_comment_time = DEADLINE - MAX_SILENT_TIME
    topic_list_path = '/home/kqc/dataset/douban-group/TopicList-' + group_id + '.txt'
    topic_list = load_id_list(topic_list_path)
    print 'Num of topics loaded:', len(topic_list)

    lifespan_list = [0] * len(topic_list)
    index = 0
    base_path = '/home/kqc/dataset/douban-group/' + group_id + '/'
    for topic_id in topic_list:
        path = base_path + topic_id + '-info.txt'
        if not os.path.exists(path):
            continue
        # get the last line of the file
        line = os.popen("tail -1 " + path).readlines()[0]
        line = line.strip()
        if line == '':
            continue
        seg_list = line.split('[=]')
        # publish time of the last comment
        last_comment_pubdate = datetime.strptime(seg_list[4], '%Y-%m-%d %H:%M:%S')
        if last_comment_pubdate > latest_comment_time:
            continue
        # get the first line
        line = os.popen("head -1 " + path).readlines()[0]
        line = line.strip()
        seg_list = line.split('[=]')
        thread_pubdate = datetime.strptime(seg_list[4], '%Y-%m-%d %H:%M:%S')
        #if total_comment < MIN_COMMENT or DEADLINE < thread_pubdate + target_date:
        #    continue
        lifespan = last_comment_pubdate - thread_pubdate
        # ignore threads whose lifespan exceeds 90 days
        if lifespan.total_seconds() > 90 * seconds_in_one_day:
            continue
        lifespan_list[index] = lifespan
        index += 1

    lifespan_list = lifespan_list[:index]
    plot_histogram(lifespan_list)
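# plot_histogram is defined elsewhere. A minimal sketch, assuming it converts
# the timedelta lifespans to days before binning:
import matplotlib.pyplot as plt

def plot_histogram(lifespan_list, title=''):
    """Histogram of thread lifespans, in days."""
    days = [span.total_seconds() / (24 * 3600.0) for span in lifespan_list]
    plt.hist(days, bins=50)
    plt.xlabel('Lifespan (days)')
    plt.ylabel('Number of threads')
    if title:
        plt.title(title)
    plt.show()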
def main(group_id):
    # read the topic id list
    path = 'data/' + group_id + '/' + group_id + '-TopicList.txt'
    topic_id_list = load_id_list(path)
    #topic_id_list = ['34029324']
    for topic_id in topic_id_list:
        path = 'data/' + group_id + '/' + topic_id + '-content.txt'
        topic_pubdate = None
        try:
            with codecs.open(path, 'r', 'utf-8') as f:
                content = f.read()
            seg_list = content.split('[=]')
            num_comment = int(seg_list[5])
            # topic publishing date
            topic_pubdate = datetime.strptime(seg_list[4], '%Y-%m-%d %H:%M:%S')
            if num_comment < threshold:
                continue
        except IOError:
            continue

        path = 'data/' + group_id + '/' + topic_id + '-comment.txt'
        f = codecs.open(path, 'r', 'utf-8')
        date_list = [topic_pubdate]
        row = ''
        for line in f:
            if line != '[*ROWEND*]\n':
                row = row + line
            else:
                seg_list = row.split('[=]')
                date = seg_list[4]
                date = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
                date_list.append(date)
                row = ''
        f.close()
        date_list.sort(reverse=True)

        count = len(date_list)
        print 'topic id:', topic_id
        print 'Number of comments: %d\n' % (count-1)
        #plot_popularity(date_list)
        plot_granularity_popularity(date_list, freq='D')
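# plot_granularity_popularity is defined elsewhere. A minimal sketch, assuming
# freq='D' means comment dates are bucketed per day and the daily counts are
# plotted over the thread's life:
import matplotlib.pyplot as plt

def plot_granularity_popularity(date_list, freq='D'):
    """Plot comment volume per day (freq='D' is the only granularity assumed here)."""
    day_count = dict()
    for d in date_list:
        key = d.date()
        day_count[key] = day_count.get(key, 0) + 1
    days = sorted(day_count.keys())
    counts = [day_count[d] for d in days]
    plt.plot(days, counts, marker='.')
    plt.xlabel('Date')
    plt.ylabel('Comments per day')
    plt.show()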
def main(group_id):
    import pickle
    #topiclist_path = 'data-dynamic/TopicList-' + group_id + '-shuffled.txt'
    topiclist_path = 'data-dynamic/' + group_id + '-post-list.txt' # for the Tianya dataset
    topic_list = load_id_list(topiclist_path)
    print 'Number of total topics loaded: ', len(topic_list)

    """
    # cache the intermediate result
    num_comment_list, num_lifecycle_list = collect_comments_lifecycle(group_id, topic_list)
    print 'Number of threads:', len(num_comment_list)
    f = open('pickle/comment-lifecycle-dist-tianya.pickle', 'w')
    pickle.dump([num_comment_list, num_lifecycle_list], f)
    f.close()
    """
    f = open('pickle/comment-lifecycle-dist-tianya.pickle', 'r')
    num_comment_list, num_lifecycle_list = pickle.load(f)
    f.close()

    #import ipdb; ipdb.set_trace()
    #plot_histogram(num_comment_list, '')
    #plot_histogram(num_lifecycle_list, '')

    fig = plt.figure()
    ax1 = plt.subplot(121)  # left panel
    ax2 = plt.subplot(122)  # right panel
    print 'Number of elements: ', len(num_comment_list)
    #plot_loglog(num_comment_list, u'', u'Number of comments', u'Number of threads')
    plot_loglog(ax1, num_comment_list, '', 'Number of comments', 'Number of threads')

    for i in range(len(num_lifecycle_list)):
        #num_lifecycle_list[i] = int(num_lifecycle_list[i] * 24)
        num_lifecycle_list[i] = int(num_lifecycle_list[i])
    #plot_loglog(num_lifecycle_list, u'', u'Length of lifecycle', u'Number of threads')
    plot_loglog(ax2, num_lifecycle_list, '', 'Length of lifecycle(days)', 'Number of threads')
    plt.show()
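# plot_loglog is defined elsewhere. A minimal sketch, assuming it draws the
# value-frequency distribution on log-log axes of the given subplot:
def plot_loglog(ax, value_list, title, xlabel, ylabel):
    """Log-log frequency plot: how many threads take each value."""
    freq = dict()
    for v in value_list:
        freq[v] = freq.get(v, 0) + 1
    xs = sorted(freq.keys())
    ys = [freq[x] for x in xs]
    ax.loglog(xs, ys, marker='.', linestyle='None')
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)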
def main(group_id):
    # read the topic id list
    base_path = '/home/kqc/dataset/douban-group/'
    path = base_path + '/TopicList-' + group_id + '.txt'
    topic_id_list = load_id_list(path)
    topic_id_list = ['21108362']  # debug: restrict to a single topic
    for topic_id in topic_id_list:
        path = base_path + group_id + '/' + topic_id + '-info.txt'
        f = codecs.open(path, 'r', 'utf-8')
        content = f.readline()
        seg_list = content.split('[=]')
        num_comment = int(seg_list[5])
        # topic publishing date
        topic_pubdate = datetime.strptime(seg_list[4], '%Y-%m-%d %H:%M:%S')
        if num_comment < threshold:
            f.close()
            continue

        date_list = [topic_pubdate]
        for line in f:
            seg_list = line.split('[=]')
            date = seg_list[4]
            date = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
            date_list.append(date)
        f.close()
        date_list.sort(reverse=True)

        count = len(date_list)
        print 'topic id:', topic_id
        print 'Number of comments: %d\n' % (count-1)
        #plot_popularity(date_list)
        plot_granularity_popularity(date_list, freq='D')
def fix_quote_info(source_path, target_path, post_list):
    # (only the tail of this function survives; the lines opening each branch
    #  are not shown)
                print 'Find missing quote in post: ', post_id
                print 'Comment content: ', content
            else:
                tf.write(sline + '\n')
            # ...
                print 'Error in finding quote in post: ', post_id
                print 'Matched quote: ', m.group()
                print 'Comment content: ', content
        else:
            tf.write(sline + '\n')
    sf.close()
    tf.close()

if __name__ == '__main__':
    import codecs
    import sys
    writer = codecs.getwriter('utf8')
    sys.stdout = writer(sys.stdout)  # for writing to pipes

    section_id = 'free'
    base_path = '/home/kqc/dataset/tianya-forum/'
    source_path = base_path + 'test/'
    target_path = '/home/kqc/dataset/tianya-forum-fixquote/' + section_id + '/'
    post_list_path = base_path + section_id + '-post-list.txt'
    post_list = load_id_list(post_list_path)

    post_list = ['3164851']  # debug: restrict to a single post
    fix_quote_info(source_path, target_path, post_list)
def main(section_id, base_path):
    """ Extract dynamic factors for the Tianya dataset. """
    post_list_path = base_path + section_id + '-post-list.txt'
    topic_list = load_id_list(post_list_path)
    topic_list = list(set(topic_list))  # drop duplicate post ids
    target_base_path = 'data-dynamic/'
    #topic_list = ['1377621']

    # topic_id is kept here (rather than renamed to post_id) because
    # feature_dict uses 'topic_id' as a key
    valid_topic_list = []  # post ids from which features were actually extracted
    for topic_id in topic_list:
        path = base_path + section_id + '/' + topic_id + '-info.txt'
        if not os.path.exists(path):
            continue
        tpath = target_base_path + section_id + '/' + topic_id + '.txt'
        # skip posts whose output file already exists
        if os.path.exists(tpath):
            continue
        f = codecs.open(path, 'r', 'utf8')
        print 'Extracting features from post file: ', path
        # accumulate the output in a string; it may grow to several MB
        result_string = ""

        # the first line describes the post itself
        line = f.readline().strip()
        seg_list = line.split('[=]')
        if len(seg_list) != 8:
            print 'Error in the first line of topic file: ', path
            f.close()
            continue
        lz = seg_list[2]            # LZ id for author reply-to, topic_id for the comment tree
        lz_user_name = seg_list[3]  # user name of the original poster (LZ)
        pubdate = seg_list[5]
        num_comment = int(seg_list[6])  # NOTE: the file may hold fewer comments than
                                        # this claims, due to earlier crawling errors

        # first line: topic info
        feature_dict = dict()
        feature_dict['topic_id'] = topic_id
        feature_dict['lz'] = lz
        feature_dict['pubdate'] = pubdate
        feature_dict['num_comment'] = num_comment
        #tf.write(str(feature_dict) + '\n')
        result_string += (str(feature_dict) + '\n')
        # NOTE: because crawling may have saved a file only partially, the actual
        # number of comments can be smaller than the first line claims; such
        # files are checked below and dropped if they fail

        # build two graphs
        comment_tree = Graph(directed=True)
        author_reply = Graph(directed=True)
        # Build a two-mode network whose node classes are comments (including
        # the original post) and users. A comment and an author are linked if
        # 1) the author wrote the comment (including the LZ writing the post), or
        # 2) the author commented on it (the original post counts as a comment).
        # Comment vertices have type=False, author vertices type=True.
        comment_author_bigraph = Graph(directed=False)
        comment_author_bigraph.add_vertex(topic_id, type=False)
        comment_author_bigraph.add_vertex(lz, type=True)
        comment_author_bigraph.add_edge(topic_id, lz)

        comment_dict = dict()  # map comment id to graph index
        comment_dict[topic_id] = comment_tree.vcount()
        comment_tree.add_vertex(topic_id, date=pubdate, author=lz, depth=0)
        author_dict = dict()   # map author id to graph index
        author_dict[lz] = author_reply.vcount()
        author_reply.add_vertex(lz)

        flag = True   # whether this file is well-formed
        max_depth = 0
        # measures how heated the comment-tree discussion is: the root
        # contributes 0, each depth-1 node contributes 1, and so on
        weighted_depth_sum = 0
        current_comment_count = 0  # number of comments seen so far
        for line in f:
            # collect all features in feature_dict; order does not matter
            feature_dict = dict()
            line = line.strip()
            seg_list = line.split('[=]')
            if len(seg_list) != 8:
                print 'Error in the comment line of topic file: ', path
                flag = False
                break
            cid = seg_list[0]      # comment id
            pid = seg_list[3]      # user id
            uname = seg_list[4]    # user name
            pubdate = seg_list[5]  # publish time
            replyto = seg_list[6]  # quoted comment

            feature_dict['cid'] = cid
            feature_dict['pid'] = pid
            feature_dict['pubdate'] = pubdate
            feature_dict['replyto'] = replyto  # cid of the comment replied to

            comment_dict[cid] = comment_tree.vcount()
            comment_tree.add_vertex(cid, date=pubdate, author=pid)  # the commenter is this node's author
            current_comment_count += 1
            feature_dict['current_comment_count'] = current_comment_count

            comment_author_bigraph.add_vertex(cid, type=False)
            # if this author has commented before, it is already in author_dict
            if not pid in author_dict:
                author_dict[pid] = author_reply.vcount()
                author_reply.add_vertex(pid)
                comment_author_bigraph.add_vertex(pid, type=True)
            # the author-of relationship
            comment_author_bigraph.add_edge(pid, cid)

            replyto_pid = ''
            commenton_cid = ''
            parent_index = 0  # index of this node's parent in the comment tree
            if replyto == '':
                commenton_cid = topic_id
                parent_index = comment_dict[commenton_cid]
                replyto_pid = lz
            else:
                commenton_cid = replyto
                parent_index = comment_dict[commenton_cid]
                replyto_pid = comment_tree.vs[parent_index]['author']
            comment_tree.add_edge(cid, commenton_cid)
            comment_author_bigraph.add_edge(pid, commenton_cid)

            # set the depth attribute of the cid node
            index = comment_dict[cid]
            if comment_tree.vs[parent_index]['depth'] == None:
                # TODO: unexplained error, e.g. post 3211910
                flag = False
                break
            else:
                current_depth = comment_tree.vs[index]['depth'] = comment_tree.vs[parent_index]['depth'] + 1
                weighted_depth_sum += current_depth
                avg_weighted_depth_sum = weighted_depth_sum * 1.0 / current_comment_count
                if current_depth > max_depth:
                    max_depth = current_depth

            # ignore self-replies
            if pid != replyto_pid:
                # do not add a second edge if pid already links to replyto_pid
                v1 = author_dict[pid]
                v2 = author_dict[replyto_pid]
                if author_reply.get_eid(v1, v2, directed=True, error=False) == -1:
                    author_reply.add_edge(v1, v2)

            # number of commenters participating so far
            num_authors = author_reply.vcount()
            feature_dict['num_authors'] = num_authors

            # statistics written into the target file
            mean_degree = sum(author_reply.degree()) * 1.0 / author_reply.vcount()
            avg_local_transitivity = author_reply.transitivity_avglocal_undirected(mode='nan')  # avg of local transitivity
            clustering_coefficient = author_reply.transitivity_undirected(mode='zero')
            assortativity = author_reply.assortativity_degree(directed=False)
            num_componnet = len(author_reply.components(mode=WEAK))
            reply_density = author_reply.density(loops=True)
            # neither cohesion nor adhesion fits: almost every graph has an edge with a degree-1 endpoint
            #cohesion = author_reply.cohesion(neighbors='ignore')
            #adhesion = author_reply.adhesion()
            # Ref: http://igraph.sourceforge.net/doc/python/igraph.Graph-class.html#cohesive_blocks
            # cohesive_blocks only works on undirected graphs
            #author_reply_cohesive_block = author_reply.cohesive_blocks()
            #author_reply_max_cohesions = author_reply_cohesive_block.max_cohesions()

            feature_dict['mean_degree'] = mean_degree
            feature_dict['avg_local_transitivity'] = avg_local_transitivity
            feature_dict['clustering_coefficient'] = clustering_coefficient
            feature_dict['assortativity'] = assortativity
            feature_dict['num_componnet'] = num_componnet
            feature_dict['reply_density'] = reply_density
            # author-reply graph group cohesiveness
            #feature_dict['author_reply_max_cohesions'] = author_reply_max_cohesions

            # dynamic factors from WWW'13, Bao
            tree_density = comment_tree.density(loops=False)
            average_path_length = comment_tree.average_path_length(directed=False)  # do not consider direction
            #diffusion_depth = comment_tree.diameter(directed=True)  # diffusion depth of a tree, i.e. its depth
            diffusion_depth = max_depth
            # comment-tree related factors
            feature_dict['tree_density'] = tree_density
            feature_dict['diffusion_depth'] = diffusion_depth
            feature_dict['avg_weighted_depth_sum'] = avg_weighted_depth_sum
            feature_dict['avg_path_length'] = average_path_length  # Wiener index: the average distance between all pairs of nodes in a cascade

            # properties of the comment-author two-mode network
            ca_mean_degree = sum(comment_author_bigraph.degree()) * 1.0 / comment_author_bigraph.vcount()
            ca_avg_local_transitivity = comment_author_bigraph.transitivity_avglocal_undirected(mode='nan')  # avg of local transitivity
            ca_clustering_coefficient = comment_author_bigraph.transitivity_undirected(mode='zero')
            ca_assortativity = comment_author_bigraph.assortativity_degree(directed=False)
            ca_num_componnet = len(comment_author_bigraph.components(mode=WEAK))
            ca_reply_density = comment_author_bigraph.density(loops=True)
            #comment_author_cohesive_block = comment_author_bigraph.cohesive_blocks()
            #ca_max_cohesions = comment_author_cohesive_block.max_cohesions()

            feature_dict['ca_mean_degree'] = ca_mean_degree
            feature_dict['ca_avg_local_transitivity'] = ca_avg_local_transitivity
            feature_dict['ca_clustering_coefficient'] = ca_clustering_coefficient
            feature_dict['ca_assortativity'] = ca_assortativity
            feature_dict['ca_num_componnet'] = ca_num_componnet
            feature_dict['ca_reply_density'] = ca_reply_density
            #feature_dict['ca_max_cohesions'] = ca_max_cohesions

            # write the feature dict to the output buffer
            #tf.write(str(feature_dict) + '\n')
            result_string += (str(feature_dict) + '\n')

            # do not consider threads with more than MAX_COMMENT comments
            if current_comment_count >= MAX_COMMENT:
                break

        # if the loop ended before MAX_COMMENT and the comment count falls short
        # of the total claimed in the first line, ignore this post
        if current_comment_count < MAX_COMMENT and num_comment != current_comment_count:
            flag = False

        # print dynamic features (debugging)
        #plot(comment_tree)
        #plot(author_reply)
        #ipdb.set_trace()
        #print author_reply.transitivity_undirected(mode='zero')
        #print author_reply.transitivity_avglocal_undirected(mode='zero')  # avg of local transitivity
        #print author_reply.assortativity_degree(False)
        f.close()

        if flag:
            valid_topic_list.append(topic_id)
            # write the buffered features to disk
            print 'Saving: ', tpath
            tf = codecs.open(tpath, 'w', 'utf8')
            tf.write(result_string)
            tf.close()

    # save the ids of all valid posts
    path = target_base_path + section_id + '-post-list-feature.txt'
    f = codecs.open(path, 'w', 'utf8')
    for topic_id in valid_topic_list:
        f.write(topic_id + '\n')
    f.close()
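# The two feature extractors rely on names that their module headers would
# provide. A minimal sketch of the assumed header; the MAX_COMMENT value is an
# assumption matching the "more than 1000 comments" cut-off in the comments:
import os
import codecs
from igraph import Graph, WEAK, plot

MAX_COMMENT = 1000  # assumed value; threads are truncated at this comment count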
def main(group_id):
    base_path = '/home/kqc/dataset/douban-group/'
    #group_id = 'qiong'
    topic_path = base_path + 'TopicList-' + group_id + '.txt'
    topic_list = load_id_list(topic_path)
    target_base_path = 'data-dynamic/'
    #topic_list = ['1377621']
    for topic_id in topic_list:
        path = base_path + group_id + '/' + topic_id + '-info.txt'
        if not os.path.exists(path):
            continue
        print 'Reading topic file: ', path
        tpath = target_base_path + group_id + '/' + topic_id + '.txt'
        f = open(path, 'r')
        tf = open(tpath, 'w')

        # read topic info
        line = f.readline().strip()
        seg_list = line.split('[=]')
        if len(seg_list) < 7:
            print 'Error in the first line of topic file: ', path
            f.close()
            tf.close()
            continue
        lz = seg_list[2]  # LZ id for author reply-to, topic_id for the comment tree
        pubdate = seg_list[4]
        num_comment = int(seg_list[5])

        # first line: topic info
        feature_dict = dict()
        feature_dict['topic_id'] = topic_id
        feature_dict['lz'] = lz
        feature_dict['pubdate'] = pubdate
        feature_dict['num_comment'] = num_comment
        tf.write(str(feature_dict) + '\n')

        # build two graphs
        comment_tree = Graph(directed=True)
        author_reply = Graph(directed=True)
        # Build a two-mode network whose node classes are comments (including
        # the original post) and users. A comment and an author are linked if
        # 1) the author wrote the comment (including the LZ writing the post), or
        # 2) the author commented on it (the original post counts as a comment).
        # Comment vertices have type=False, author vertices type=True.
        comment_author_bigraph = Graph(directed=False)
        comment_author_bigraph.add_vertex(topic_id, type=False)
        comment_author_bigraph.add_vertex(lz, type=True)
        comment_author_bigraph.add_edge(topic_id, lz)

        comment_dict = dict()  # map comment id to graph index
        comment_dict[topic_id] = comment_tree.vcount()
        comment_tree.add_vertex(topic_id, date=pubdate, author=lz, depth=0)
        author_dict = dict()   # map author id to graph index
        author_dict[lz] = author_reply.vcount()
        author_reply.add_vertex(lz)

        max_depth = 0
        # measures how heated the comment-tree discussion is: the root
        # contributes 0, each depth-1 node contributes 1, and so on
        weighted_depth_sum = 0
        current_comment_count = 0  # number of comments seen so far
        for line in f:
            # collect all features in feature_dict; order does not matter
            feature_dict = dict()
            line = line.strip()
            seg_list = line.split('[=]')
            if len(seg_list) < 7:
                print 'Error in the comment line of topic file: ', path
                break
            cid = seg_list[0]
            pid = seg_list[3]
            pubdate = seg_list[4]
            replyto = seg_list[5]

            feature_dict['cid'] = cid
            feature_dict['pid'] = pid
            feature_dict['pubdate'] = pubdate
            feature_dict['replyto'] = replyto  # cid of the comment replied to

            comment_dict[cid] = comment_tree.vcount()
            comment_tree.add_vertex(cid, date=pubdate, author=pid)  # the commenter is this node's author
            current_comment_count += 1
            feature_dict['current_comment_count'] = current_comment_count

            comment_author_bigraph.add_vertex(cid, type=False)
            # if this author has commented before, it is already in author_dict
            if not pid in author_dict:
                author_dict[pid] = author_reply.vcount()
                author_reply.add_vertex(pid)
                comment_author_bigraph.add_vertex(pid, type=True)
            # the author-of relationship
            comment_author_bigraph.add_edge(pid, cid)

            replyto_pid = ''
            commenton_cid = ''
            if replyto == '':
                commenton_cid = topic_id
                parent_index = comment_dict[commenton_cid]
                #comment_tree.add_edge(cid, topic_id)
                replyto_pid = lz
                #author_reply.add_edge(pid, lz)
            else:
                commenton_cid = replyto
                #comment_tree.add_edge(cid, replyto)
                parent_index = comment_dict[commenton_cid]
                replyto_pid = comment_tree.vs[parent_index]['author']
                #author_reply.add_edge(pid, comment_tree.vs[index]['author'])
            comment_tree.add_edge(cid, commenton_cid)
            comment_author_bigraph.add_edge(pid, commenton_cid)

            # set the depth attribute of the cid node
            index = comment_dict[cid]
            current_depth = comment_tree.vs[index]['depth'] = comment_tree.vs[parent_index]['depth'] + 1
            weighted_depth_sum += current_depth
            avg_weighted_depth_sum = weighted_depth_sum * 1.0 / current_comment_count
            if current_depth > max_depth:
                max_depth = current_depth

            # ignore self-replies
            if pid != replyto_pid:
                # do not add a second edge if pid already links to replyto_pid
                v1 = author_dict[pid]
                v2 = author_dict[replyto_pid]
                if author_reply.get_eid(v1, v2, directed=True, error=False) == -1:
                    author_reply.add_edge(v1, v2)

            # number of participating commenters
            num_authors = author_reply.vcount()
            feature_dict['num_authors'] = num_authors

            # statistics written into the target file
            mean_degree = sum(author_reply.degree()) * 1.0 / author_reply.vcount()
            avg_local_transitivity = author_reply.transitivity_avglocal_undirected(mode='nan')  # avg of local transitivity
            clustering_coefficient = author_reply.transitivity_undirected(mode='zero')
            assortativity = author_reply.assortativity_degree(directed=False)
            num_componnet = len(author_reply.components(mode=WEAK))
            reply_density = author_reply.density(loops=True)
            # neither cohesion nor adhesion fits: almost every graph has an edge with a degree-1 endpoint
            #cohesion = author_reply.cohesion(neighbors='ignore')
            #adhesion = author_reply.adhesion()
            # Ref: http://igraph.sourceforge.net/doc/python/igraph.Graph-class.html#cohesive_blocks
            # cohesive_blocks only works on undirected graphs
            #author_reply_cohesive_block = author_reply.cohesive_blocks()
            #author_reply_max_cohesions = author_reply_cohesive_block.max_cohesions()

            feature_dict['mean_degree'] = mean_degree
            feature_dict['avg_local_transitivity'] = avg_local_transitivity
            feature_dict['clustering_coefficient'] = clustering_coefficient
            feature_dict['assortativity'] = assortativity
            feature_dict['num_componnet'] = num_componnet
            feature_dict['reply_density'] = reply_density
            # author-reply graph group cohesiveness
            #feature_dict['author_reply_max_cohesions'] = author_reply_max_cohesions

            # dynamic factors from WWW'13, Bao
            tree_density = comment_tree.density(loops=False)
            average_path_length = comment_tree.average_path_length(directed=False)  # do not consider direction
            #diffusion_depth = comment_tree.diameter(directed=True)  # diffusion depth of a tree, i.e. its depth
            diffusion_depth = max_depth
            # comment-tree related factors
            feature_dict['tree_density'] = tree_density
            feature_dict['diffusion_depth'] = diffusion_depth
            feature_dict['avg_weighted_depth_sum'] = avg_weighted_depth_sum
            feature_dict['avg_path_length'] = average_path_length  # Wiener index: the average distance between all pairs of nodes in a cascade

            # properties of the comment-author two-mode network
            ca_mean_degree = sum(comment_author_bigraph.degree()) * 1.0 / comment_author_bigraph.vcount()
            ca_avg_local_transitivity = comment_author_bigraph.transitivity_avglocal_undirected(mode='nan')  # avg of local transitivity
            ca_clustering_coefficient = comment_author_bigraph.transitivity_undirected(mode='zero')
            ca_assortativity = comment_author_bigraph.assortativity_degree(directed=False)
            ca_num_componnet = len(comment_author_bigraph.components(mode=WEAK))
            ca_reply_density = comment_author_bigraph.density(loops=True)
            #comment_author_cohesive_block = comment_author_bigraph.cohesive_blocks()
            #ca_max_cohesions = comment_author_cohesive_block.max_cohesions()

            feature_dict['ca_mean_degree'] = ca_mean_degree
            feature_dict['ca_avg_local_transitivity'] = ca_avg_local_transitivity
            feature_dict['ca_clustering_coefficient'] = ca_clustering_coefficient
            feature_dict['ca_assortativity'] = ca_assortativity
            feature_dict['ca_num_componnet'] = ca_num_componnet
            feature_dict['ca_reply_density'] = ca_reply_density
            #feature_dict['ca_max_cohesions'] = ca_max_cohesions

            # write the feature dict to file
            tf.write(str(feature_dict) + '\n')

            # do not consider threads with more than MAX_COMMENT comments
            if current_comment_count >= MAX_COMMENT:
                break

        # print dynamic features (debugging)
        #plot(comment_tree)
        #plot(author_reply)
        #ipdb.set_trace()
        #print author_reply.transitivity_undirected(mode='zero')
        #print author_reply.transitivity_avglocal_undirected(mode='zero')  # avg of local transitivity
        #print author_reply.assortativity_degree(False)
        f.close()
        tf.close()
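# A toy check of the depth bookkeeping both extractors use (hypothetical ids,
# no igraph needed). A thread with replies c1->post, c2->c1, c3->c1 should give
# max_depth = 2 and avg_weighted_depth_sum = (1+2+2)/3:
def toy_depth_check():
    parent = {'c1': 'post', 'c2': 'c1', 'c3': 'c1'}  # replyto relation
    depth = {'post': 0}
    weighted_depth_sum = 0
    max_depth = 0
    count = 0
    for cid in ['c1', 'c2', 'c3']:  # comments arrive in publish order
        depth[cid] = depth[parent[cid]] + 1
        count += 1
        weighted_depth_sum += depth[cid]
        if depth[cid] > max_depth:
            max_depth = depth[cid]
    print 'max_depth:', max_depth                                      # 2
    print 'avg_weighted_depth_sum:', weighted_depth_sum * 1.0 / count  # ~1.67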
def main(group_id):
    topiclist_path = 'data-dynamic/TopicList-' + group_id + '-filtered.txt'
    topic_list = load_id_list(topiclist_path)
    print 'Number of total topics loaded: ', len(topic_list)

    # Set the pre-computed popularity levels. A thread's final comment count may
    # exceed the largest level; note that the smallest popularity value, 0, is ignored.
    #pop_level = [8, 13, 23, 43, float('inf')]  # group: zhuangb
    pop_level = [25, 50, float('inf')]   # group: zhuangb
    #pop_level = [25, 50, float('inf')]  # group: buybook
    #pop_level = [30, float('inf')]      # group: buybook

    # prediction_date: start predicting this long after a post is published
    # target_date: predict the comment count at this point in time
    # Both parameters are tunable.
    # sampling interval
    gaptime = timedelta(hours=5)
    prediction_date = timedelta(hours=10*5)
    response_time = timedelta(hours=50)
    target_date = prediction_date + response_time
    # number of intervals each topic has before prediction_date
    num_feature = int(prediction_date.total_seconds() / gaptime.total_seconds())
    print 'Number of features: ', num_feature

    alpha = 1.5
    percentage_threshold = 0.7

    print 'Generating training and test dataset...'
    dataset, comment_count_dataset, Bao_dataset, category_count_list = prepare_dataset(group_id, \
        topic_list, gaptime, pop_level, prediction_date, target_date, alpha, percentage_threshold)

    # save the topic ids that survived filtering
    #save_filtered_topics(group_id, dataset)

    #print 'Plotting factor propagation'
    #factor_propagation_plot(dataset, num_feature)
    #topic_propagation_plot(dataset, num_feature)
    #return

    # Shuffle the posts. Disabled while debugging so that runs are reproducible.
    #shuffle(dataset)

    print 'Down-sampling the datasets...'
    dataset, comment_count_dataset, Bao_dataset, category_count_list = down_sampling_dataset(dataset, \
        comment_count_dataset, Bao_dataset, category_count_list)

    total = len(dataset)
    train_cnt = total * 4 / 5
    train_set = dataset[:train_cnt]
    test_set = dataset[train_cnt:]
    print 'Training: %d, Test: %d' % (train_cnt, total-train_cnt)
    print 'Category 0: %d, Category 1: %d ' % (category_count_list[0], category_count_list[1])
    print 'Imbalance ratio: ', category_count_list[0] * 1.0 / category_count_list[1]

    #num_level = len(pop_level)
    #raw_input()
    #import ipdb
    #ipdb.set_trace()

    print 'The proposed model:'
    k = 3
    num_level = 2
    num_factor = len(train_set[0][1][1])
    print 'Classify test instances...'
    y_true, y_pred, comment_true, comment_pred, give_up_list, prediction_list = classify(train_set, test_set, k, num_level)
    # evaluate results
    print 'Number of give-ups: ', len(give_up_list)
    classification_evaluation(y_true, y_pred)
    level_MSE_evaluation(y_true, y_pred)
    #save_predictions(prediction_list, y_pred, factor_name='num_authors')
    #save_predictions(prediction_list, y_true, factor_name='all')
    comment_RSE_evaluation(comment_true, comment_pred)
    #print 'The class prior:', prior_score

    from svm_model import svm_model
    print 'Building a svm model...'
    y_true, y_pred = svm_model(train_set, test_set)
    classification_evaluation(y_true, y_pred)

    # inspect how each factor predicts at different ratios
    #from utils import ratio_accuracy_distribution_plot
    #ratio_accuracy_distribution_plot(y_true, y_pred, test_set, group_id, factor_name='tree_link_density')

    # S-H model
    print '\nThe S-H model:'
    baseline_train_set = comment_count_dataset[:train_cnt]
    baseline_test_set = comment_count_dataset[train_cnt:]
    y_true, y_pred, comment_true_cnt, comment_pred_cnt = SH_model(baseline_train_set, baseline_test_set, alpha)
    # drop some instances with cat = 0
    comment_RSE_evaluation(comment_true_cnt, comment_pred_cnt)
    # level-wise classification
    classification_evaluation(y_true, y_pred)
    level_MSE_evaluation(y_true, y_pred)

    print '\nML model:'
    y_true, y_pred, comment_true_cnt, comment_pred_cnt = ML_model(baseline_train_set, baseline_test_set, num_feature, alpha)
    comment_RSE_evaluation(comment_true_cnt, comment_pred_cnt)
    classification_evaluation(y_true, y_pred)

    print '\nMLR model:'
    y_true, y_pred, comment_true_cnt, comment_pred_cnt = MLR_model(baseline_train_set, baseline_test_set, num_feature, alpha)
    comment_RSE_evaluation(comment_true_cnt, comment_pred_cnt)
    classification_evaluation(y_true, y_pred)

    print '\nkNN method:'
    k = 1
    y_true, y_pred, comment_true_cnt, comment_pred_cnt = knn_method(train_set, test_set, k, num_feature, alpha)
    comment_RSE_evaluation(comment_true_cnt, comment_pred_cnt)
    # level-wise classification
    classification_evaluation(y_true, y_pred)

    print "\nBao's method:"
    Bao_train_set = Bao_dataset[:train_cnt]
    Bao_test_set = Bao_dataset[train_cnt:]
    y_true, y_pred, comment_true_cnt, comment_pred_cnt = Bao_method(Bao_train_set, Bao_test_set, alpha)
    comment_RSE_evaluation(comment_true_cnt, comment_pred_cnt)
    classification_evaluation(y_true, y_pred)
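# classification_evaluation and comment_RSE_evaluation come from the project's
# evaluation module, which is not shown. Hypothetical stand-ins, assuming
# standard accuracy plus a relative-squared-error metric on comment counts:
from sklearn import metrics

def classification_evaluation(y_true, y_pred):
    print 'Accuracy: ', metrics.accuracy_score(y_true, y_pred)
    print metrics.classification_report(y_true, y_pred)

def comment_RSE_evaluation(comment_true, comment_pred):
    # relative squared error, averaged over threads (assumed definition)
    rse = [((p - t) * 1.0 / t) ** 2 for t, p in zip(comment_true, comment_pred) if t > 0]
    print 'Mean RSE: ', sum(rse) / len(rse)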
        # (tail of select_k; the head of the function is not shown)
        IPW_acc_list.append(IPW_acc)
    return IPW_acc_list

if __name__ == "__main__":
    import sys
    group_id = sys.argv[1]
    topiclist_path = "data-dynamic/TopicList-" + group_id + "-filtered.txt"  # for douban-group
    # topiclist_path = 'data-dynamic/' + group_id + '-post-list.txt'  # for the Tianya dataset
    print "Reading topic list from file:", topiclist_path
    topic_list = load_id_list(topiclist_path)
    print "Number of total topics loaded: ", len(topic_list)
    # for test
    # topic_list = topic_list[:50]

    threshold_p = 0.7
    prediction_date_tr = timedelta(hours=50)
    response_time_delta = timedelta(hours=25)
    # cvk_list = [1, 3, 5, 7]
    cvk_list = [9]
    for cvk in cvk_list:
        print "CV for k: ", cvk
        IPW_acc_list = select_k(group_id, topic_list, threshold_p, prediction_date_tr, response_time_delta, cvk)
        print IPW_acc_list
        print "avg_IPW_acc: ", sum(IPW_acc_list) * 1.0 / len(IPW_acc_list)
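# Usage sketch (the script file name is an assumption; per sys.argv[1] above,
# the group id is the only command-line argument):
#
#   $ python select_k.py zhuangb
#
# This prints the per-fold IPW accuracies and their mean for each k in cvk_list.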