def main(group_id):
    #topiclist_path = 'data-dynamic/TopicList-' + group_id + '-shuffled.txt' # for the Douban dataset
    topiclist_path = 'data-dynamic/' + group_id + '-post-list.txt' # for the Tianya dataset
    topic_list = load_id_list(topiclist_path)
    print 'Number of total topics loaded: ', len(topic_list)

    pop_level = [25, 50, float('inf')]

    # prediction_date: start predicting this long after a post is published
    # target_date: predict the comment count at this point in time
    # Both parameters are tunable.
    # sampling interval
    gaptime = timedelta(hours=3)
    prediction_date = timedelta(hours=10*3)
    response_time = timedelta(hours=24)
    target_date = prediction_date + response_time
    # number of intervals each topic has before prediction_date
    num_feature = int(prediction_date.total_seconds() / gaptime.total_seconds())
    print 'Number of features: ', num_feature

    alpha = 1.5
    percentage_threshold = 0.7

    print 'Generating training and test dataset...'
    dataset, comment_count_dataset, Bao_dataset, category_count_list = prepare_dataset(group_id, \
        topic_list, gaptime, pop_level, prediction_date, target_date, alpha, percentage_threshold)

    # save the topic ids that survived filtering
    #save_filtered_topics(group_id, dataset)

    print 'Plotting factor propagation'
    #factor_propagation_plot(dataset, num_feature)
    #topic_propagation_plot(dataset, num_feature)
    #return

    # Shuffle the posts. Disabled while debugging so that runs are reproducible;
    # note that with shuffling every run uses a different split.
    #shuffle(dataset)

    total = len(dataset)
    train_cnt = total * 4 / 5
    train_set = dataset[:train_cnt]
    test_set = dataset[train_cnt:]
    print 'Training: %d, Test: %d' % (train_cnt, total-train_cnt)
    print 'Category 0: %d, Category 1: %d ' % (category_count_list[0], category_count_list[1])
    print 'Imbalance ratio: ', category_count_list[0] * 1.0 / category_count_list[1]

    dataset, comment_count_dataset, Bao_dataset, category_count_list = down_sampling_dataset(dataset, \
        comment_count_dataset, Bao_dataset, category_count_list)

    print 'After down sampling...'
    total = len(dataset)
    train_cnt = total * 4 / 5
    train_set = dataset[:train_cnt]
    test_set = dataset[train_cnt:]
    print 'Training: %d, Test: %d' % (train_cnt, total-train_cnt)
    print 'Category 0: %d, Category 1: %d ' % (category_count_list[0], category_count_list[1])
    print 'Imbalance ratio: ', category_count_list[0] * 1.0 / category_count_list[1]
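# All of these scripts share a load_id_list helper that is not shown in this
# section. A minimal sketch, assuming the list file holds one id per line:
def load_id_list(path):
    """Read a list of topic/post ids, one per line, skipping blank lines."""
    id_list = []
    with open(path, 'r') as f:
        for line in f:
            line = line.strip()
            if line != '':
                id_list.append(line)
    return id_list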
def main(group_id):
    import random
    # read the raw topic list, shuffle it, and persist the shuffled order
    topic_list = load_id_list('data-dynamic/TopicList-' + group_id + '.txt')
    random.shuffle(topic_list)
    topiclist_path = 'data-dynamic/TopicList-' + group_id + '-shuffled.txt'
    f = open(topiclist_path, 'w')
    for topic_id in topic_list:
        f.write(topic_id + '\n')
    f.close()
def main(group_id):
    topic_list = load_id_list('data-dynamic/TopicList-' + group_id + '.txt')
    print 'Topics id loaded:', len(topic_list)
    base_path = 'data-dynamic/' + group_id + '/'
    x = [0] * len(topic_list)
    index = 0
    for topic_id in topic_list:
        path = base_path + topic_id + '.txt'
        if not os.path.exists(path):
            continue
        print 'Processing file: ', path
        f = open(path, 'r')
        #print 'Reading file: ', path
        line = f.readline().strip()
        seg_list = line.split('[=]')
        num_comment = int(seg_list[3])
        f.close()
        # drop threads whose total comment count falls below the threshold
        if num_comment < MIN_COMMENT:
            continue
        x[index] = num_comment
        index += 1
    x = x[:index]
    xmax = max(x)
    xmin = min(x)
    print 'Number of threads:', len(x)
    print 'Max number of comments:', xmax
    print 'Min number of comments:', xmin

    # cumulative, normalized histogram of the comment counts
    (n, bins, patches) = plt.hist(x, bins=range(xmin, xmax, 1), cumulative=True, normed=True)
    #(n, bins, patches) = plt.hist(x, bins=5)
    #plt.show()

    num_level = 2
    popularity_level = get_popularity_level(num_level, bins, n)
    print 'Popularity level: '
    print popularity_level
    # report how many threads fall into each level
    level_statics(popularity_level, x)
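# get_popularity_level and level_statics are defined elsewhere. A minimal
# sketch, assuming levels are cut where the cumulative histogram crosses equal
# probability mass (the median for num_level = 2), with inf closing the last level:
def get_popularity_level(num_level, bins, n):
    """Pick num_level-1 thresholds from a cumulative, normalized histogram."""
    level = []
    target = 1.0 / num_level
    for edge, cum in zip(bins[1:], n):
        if cum >= target * (len(level) + 1):
            level.append(int(edge))
            if len(level) == num_level - 1:
                break
    level.append(float('inf'))
    return level

def level_statics(popularity_level, x):
    """Count how many threads fall into each popularity level."""
    count = [0] * len(popularity_level)
    for value in x:
        for i, upper in enumerate(popularity_level):
            if value < upper:
                count[i] += 1
                break
    print 'Threads per level:', count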
def main(group_id, factor_index):
    topic_list_path = 'data-dynamic/TopicList-' + group_id + '.txt'
    topic_list = load_id_list(topic_list_path)
    print 'Num of topics loaded:', len(topic_list)

    popularity = [0] * len(topic_list)
    factor = [0] * len(topic_list)
    index = 0
    base_path = 'data-dynamic/' + group_id + '/'
    for topic_id in topic_list:
        path = base_path + topic_id + '.txt'
        if not os.path.exists(path):
            continue
        f = open(path, 'r')
        try:
            # get the thread publish date
            line = f.readline().strip()
            if line == '':
                continue
            seg_list = line.split('[=]')
            thread_pubdate = datetime.strptime(seg_list[2], '%Y-%m-%d %H:%M:%S')
            curr_comment_cnt = 0
            for line in f:
                line = line.strip()
                seg_list = line.split('[=]')
                pubdate = datetime.strptime(seg_list[2], '%Y-%m-%d %H:%M:%S')
                curr_comment_cnt += 1
                if pubdate < thread_pubdate + AFTER_PUBLISHING_TIME:
                    continue
                # first comment past the observation window: record the pair
                factor_value = float(seg_list[factor_index])
                popularity[index] = curr_comment_cnt
                factor[index] = factor_value
                index += 1
                break
        except Exception as e:
            print 'Exception occurred:', e
            print 'Errors in topic:', topic_id
        finally:
            f.close()

    print 'Number of pairs:', index
    popularity = popularity[:index]
    factor = factor[:index]
    factor_relevance_plot(popularity, factor)
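# factor_relevance_plot is defined elsewhere. A minimal sketch, assuming it
# scatters one dynamic factor against early popularity to eyeball the correlation:
import matplotlib.pyplot as plt

def factor_relevance_plot(popularity, factor):
    """Scatter plot of a dynamic factor vs. early comment count."""
    plt.scatter(factor, popularity, s=10, alpha=0.5)
    plt.xlabel('Factor value')
    plt.ylabel('Number of comments')
    plt.show()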
def main(group_id):
    latest_comment_time = DEADLINE - MAX_SILENT_TIME
    topic_list_path = '/home/kqc/dataset/douban-group/TopicList-' + group_id + '.txt'
    topic_list = load_id_list(topic_list_path)
    print 'Num of topics loaded:', len(topic_list)

    lifespan_list = [0] * len(topic_list)
    index = 0
    base_path = '/home/kqc/dataset/douban-group/' + group_id + '/'
    for topic_id in topic_list:
        path = base_path + topic_id + '-info.txt'
        if not os.path.exists(path):
            continue
        # get the last line of the file
        line = os.popen("tail -1 " + path).readlines()[0]
        line = line.strip()
        if line == '':
            continue
        seg_list = line.split('[=]')
        # publish time of the last comment
        last_comment_pubdate = datetime.strptime(seg_list[4], '%Y-%m-%d %H:%M:%S')
        if last_comment_pubdate > latest_comment_time:
            continue
        # get the first line
        line = os.popen("head -1 " + path).readlines()[0]
        line = line.strip()
        seg_list = line.split('[=]')
        thread_pubdate = datetime.strptime(seg_list[4], '%Y-%m-%d %H:%M:%S')
        #if total_comment < MIN_COMMENT or DEADLINE < thread_pubdate + target_date:
        #    continue
        lifespan = last_comment_pubdate - thread_pubdate
        # ignore threads whose lifespan exceeds 90 days
        if lifespan.total_seconds() > 90 * seconds_in_one_day:
            continue
        lifespan_list[index] = lifespan
        index += 1

    lifespan_list = lifespan_list[:index]
    plot_histogram(lifespan_list)
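# plot_histogram is defined elsewhere. A minimal sketch, assuming it converts
# the timedelta lifespans to days before binning:
import matplotlib.pyplot as plt

def plot_histogram(lifespan_list, title=''):
    """Histogram of thread lifespans, in days."""
    days = [span.total_seconds() / (24 * 3600.0) for span in lifespan_list]
    plt.hist(days, bins=50)
    plt.xlabel('Lifespan (days)')
    plt.ylabel('Number of threads')
    if title:
        plt.title(title)
    plt.show()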
def main(group_id):
    # read the topic id list
    path = 'data/' + group_id + '/' + group_id + '-TopicList.txt'
    topic_id_list = load_id_list(path)
    #topic_id_list = ['34029324']
    for topic_id in topic_id_list:
        path = 'data/' + group_id + '/' + topic_id + '-content.txt'
        topic_pubdate = None
        try:
            with codecs.open(path, 'r', 'utf-8') as f:
                content = f.read()
            seg_list = content.split('[=]')
            num_comment = int(seg_list[5])
            # topic publishing date
            topic_pubdate = datetime.strptime(seg_list[4], '%Y-%m-%d %H:%M:%S')
            if num_comment < threshold:
                continue
        except IOError:
            continue

        path = 'data/' + group_id + '/' + topic_id + '-comment.txt'
        f = codecs.open(path, 'r', 'utf-8')
        date_list = [topic_pubdate]
        row = ''
        for line in f:
            if line != '[*ROWEND*]\n':
                row = row + line
            else:
                seg_list = row.split('[=]')
                date = seg_list[4]
                date = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
                date_list.append(date)
                row = ''
        f.close()
        date_list.sort(reverse=True)

        count = len(date_list)
        print 'topic id:', topic_id
        print 'Number of comments: %d\n' % (count-1)
        #plot_popularity(date_list)
        plot_granularity_popularity(date_list, freq='D')
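# plot_granularity_popularity is defined elsewhere. A minimal sketch, assuming
# freq='D' means comment dates are bucketed per day and the daily counts are
# plotted over the thread's life:
import matplotlib.pyplot as plt

def plot_granularity_popularity(date_list, freq='D'):
    """Plot comment volume per day (freq='D' is the only granularity assumed here)."""
    day_count = dict()
    for d in date_list:
        key = d.date()
        day_count[key] = day_count.get(key, 0) + 1
    days = sorted(day_count.keys())
    counts = [day_count[d] for d in days]
    plt.plot(days, counts, marker='.')
    plt.xlabel('Date')
    plt.ylabel('Comments per day')
    plt.show()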
def main(group_id):
    import pickle
    #topiclist_path = 'data-dynamic/TopicList-' + group_id + '-shuffled.txt'
    topiclist_path = 'data-dynamic/' + group_id + '-post-list.txt' # for the Tianya dataset
    topic_list = load_id_list(topiclist_path)
    print 'Number of total topics loaded: ', len(topic_list)

    """
    # cache the intermediate result
    num_comment_list, num_lifecycle_list = collect_comments_lifecycle(group_id, topic_list)
    print 'Number of threads:', len(num_comment_list)
    f = open('pickle/comment-lifecycle-dist-tianya.pickle', 'w')
    pickle.dump([num_comment_list, num_lifecycle_list], f)
    f.close()
    """
    f = open('pickle/comment-lifecycle-dist-tianya.pickle', 'r')
    num_comment_list, num_lifecycle_list = pickle.load(f)
    f.close()

    #import ipdb; ipdb.set_trace()
    #plot_histogram(num_comment_list, '')
    #plot_histogram(num_lifecycle_list, '')

    fig = plt.figure()
    ax1 = plt.subplot(121)  # left panel
    ax2 = plt.subplot(122)  # right panel
    print 'Number of elements: ', len(num_comment_list)
    #plot_loglog(num_comment_list, u'', u'Number of comments', u'Number of threads')
    plot_loglog(ax1, num_comment_list, '', 'Number of comments', 'Number of threads')

    for i in range(len(num_lifecycle_list)):
        #num_lifecycle_list[i] = int(num_lifecycle_list[i] * 24)
        num_lifecycle_list[i] = int(num_lifecycle_list[i])
    #plot_loglog(num_lifecycle_list, u'', u'Length of lifecycle', u'Number of threads')
    plot_loglog(ax2, num_lifecycle_list, '', 'Length of lifecycle(days)', 'Number of threads')
    plt.show()
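# plot_loglog is defined elsewhere. A minimal sketch, assuming it draws the
# value-frequency distribution on log-log axes of the given subplot:
def plot_loglog(ax, value_list, title, xlabel, ylabel):
    """Log-log frequency plot: how many threads take each value."""
    freq = dict()
    for v in value_list:
        freq[v] = freq.get(v, 0) + 1
    xs = sorted(freq.keys())
    ys = [freq[x] for x in xs]
    ax.loglog(xs, ys, marker='.', linestyle='None')
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)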
def main(group_id):
    # read the topic id list
    base_path = '/home/kqc/dataset/douban-group/'
    path = base_path + '/TopicList-' + group_id + '.txt'
    topic_id_list = load_id_list(path)
    topic_id_list = ['21108362']  # debug: restrict to a single topic
    for topic_id in topic_id_list:
        path = base_path + group_id + '/' + topic_id + '-info.txt'
        f = codecs.open(path, 'r', 'utf-8')
        content = f.readline()
        seg_list = content.split('[=]')
        num_comment = int(seg_list[5])
        # topic publishing date
        topic_pubdate = datetime.strptime(seg_list[4], '%Y-%m-%d %H:%M:%S')
        if num_comment < threshold:
            f.close()
            continue

        date_list = [topic_pubdate]
        for line in f:
            seg_list = line.split('[=]')
            date = seg_list[4]
            date = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
            date_list.append(date)
        f.close()
        date_list.sort(reverse=True)

        count = len(date_list)
        print 'topic id:', topic_id
        print 'Number of comments: %d\n' % (count-1)
        #plot_popularity(date_list)
        plot_granularity_popularity(date_list, freq='D')
def fix_quote_info(source_path, target_path, post_list):
    # (only the tail of this function survives; the lines opening each branch
    #  are not shown)
                print 'Find missing quote in post: ', post_id
                print 'Comment content: ', content
            else:
                tf.write(sline + '\n')
            # ...
                print 'Error in finding quote in post: ', post_id
                print 'Matched quote: ', m.group()
                print 'Comment content: ', content
        else:
            tf.write(sline + '\n')
    sf.close()
    tf.close()

if __name__ == '__main__':
    import codecs
    import sys
    writer = codecs.getwriter('utf8')
    sys.stdout = writer(sys.stdout)  # for writing to pipes

    section_id = 'free'
    base_path = '/home/kqc/dataset/tianya-forum/'
    source_path = base_path + 'test/'
    target_path = '/home/kqc/dataset/tianya-forum-fixquote/' + section_id + '/'
    post_list_path = base_path + section_id + '-post-list.txt'
    post_list = load_id_list(post_list_path)

    post_list = ['3164851']  # debug: restrict to a single post
    fix_quote_info(source_path, target_path, post_list)
def main(section_id, base_path):
    """ Extract dynamic factors for the Tianya dataset. """
    post_list_path = base_path + section_id + '-post-list.txt'
    topic_list = load_id_list(post_list_path)
    topic_list = list(set(topic_list))  # drop duplicate post ids
    target_base_path = 'data-dynamic/'
    #topic_list = ['1377621']

    # topic_id is kept here (rather than renamed to post_id) because
    # feature_dict uses 'topic_id' as a key
    valid_topic_list = []  # post ids from which features were actually extracted
    for topic_id in topic_list:
        path = base_path + section_id + '/' + topic_id + '-info.txt'
        if not os.path.exists(path):
            continue
        tpath = target_base_path + section_id + '/' + topic_id + '.txt'
        # skip posts whose output file already exists
        if os.path.exists(tpath):
            continue
        f = codecs.open(path, 'r', 'utf8')
        print 'Extracting features from post file: ', path
        # accumulate the output in a string; it may grow to several MB
        result_string = ""

        # the first line describes the post itself
        line = f.readline().strip()
        seg_list = line.split('[=]')
        if len(seg_list) != 8:
            print 'Error in the first line of topic file: ', path
            f.close()
            continue
        lz = seg_list[2]            # LZ id for author reply-to, topic_id for the comment tree
        lz_user_name = seg_list[3]  # user name of the original poster (LZ)
        pubdate = seg_list[5]
        num_comment = int(seg_list[6])  # NOTE: the file may hold fewer comments than
                                        # this claims, due to earlier crawling errors

        # first line: topic info
        feature_dict = dict()
        feature_dict['topic_id'] = topic_id
        feature_dict['lz'] = lz
        feature_dict['pubdate'] = pubdate
        feature_dict['num_comment'] = num_comment
        #tf.write(str(feature_dict) + '\n')
        result_string += (str(feature_dict) + '\n')
        # NOTE: because crawling may have saved a file only partially, the actual
        # number of comments can be smaller than the first line claims; such
        # files are checked below and dropped if they fail

        # build two graphs
        comment_tree = Graph(directed=True)
        author_reply = Graph(directed=True)
        # Build a two-mode network whose node classes are comments (including
        # the original post) and users. A comment and an author are linked if
        # 1) the author wrote the comment (including the LZ writing the post), or
        # 2) the author commented on it (the original post counts as a comment).
        # Comment vertices have type=False, author vertices type=True.
        comment_author_bigraph = Graph(directed=False)
        comment_author_bigraph.add_vertex(topic_id, type=False)
        comment_author_bigraph.add_vertex(lz, type=True)
        comment_author_bigraph.add_edge(topic_id, lz)

        comment_dict = dict()  # map comment id to graph index
        comment_dict[topic_id] = comment_tree.vcount()
        comment_tree.add_vertex(topic_id, date=pubdate, author=lz, depth=0)
        author_dict = dict()   # map author id to graph index
        author_dict[lz] = author_reply.vcount()
        author_reply.add_vertex(lz)

        flag = True   # whether this file is well-formed
        max_depth = 0
        # measures how heated the comment-tree discussion is: the root
        # contributes 0, each depth-1 node contributes 1, and so on
        weighted_depth_sum = 0
        current_comment_count = 0  # number of comments seen so far
        for line in f:
            # collect all features in feature_dict; order does not matter
            feature_dict = dict()
            line = line.strip()
            seg_list = line.split('[=]')
            if len(seg_list) != 8:
                print 'Error in the comment line of topic file: ', path
                flag = False
                break
            cid = seg_list[0]      # comment id
            pid = seg_list[3]      # user id
            uname = seg_list[4]    # user name
            pubdate = seg_list[5]  # publish time
            replyto = seg_list[6]  # quoted comment

            feature_dict['cid'] = cid
            feature_dict['pid'] = pid
            feature_dict['pubdate'] = pubdate
            feature_dict['replyto'] = replyto  # cid of the comment replied to

            comment_dict[cid] = comment_tree.vcount()
            comment_tree.add_vertex(cid, date=pubdate, author=pid)  # the commenter is this node's author
            current_comment_count += 1
            feature_dict['current_comment_count'] = current_comment_count

            comment_author_bigraph.add_vertex(cid, type=False)
            # if this author has commented before, it is already in author_dict
            if not pid in author_dict:
                author_dict[pid] = author_reply.vcount()
                author_reply.add_vertex(pid)
                comment_author_bigraph.add_vertex(pid, type=True)
            # the author-of relationship
            comment_author_bigraph.add_edge(pid, cid)

            replyto_pid = ''
            commenton_cid = ''
            parent_index = 0  # index of this node's parent in the comment tree
            if replyto == '':
                commenton_cid = topic_id
                parent_index = comment_dict[commenton_cid]
                replyto_pid = lz
            else:
                commenton_cid = replyto
                parent_index = comment_dict[commenton_cid]
                replyto_pid = comment_tree.vs[parent_index]['author']
            comment_tree.add_edge(cid, commenton_cid)
            comment_author_bigraph.add_edge(pid, commenton_cid)

            # set the depth attribute of the cid node
            index = comment_dict[cid]
            if comment_tree.vs[parent_index]['depth'] == None:
                # TODO: unexplained error, e.g. post 3211910
                flag = False
                break
            else:
                current_depth = comment_tree.vs[index]['depth'] = comment_tree.vs[parent_index]['depth'] + 1
                weighted_depth_sum += current_depth
                avg_weighted_depth_sum = weighted_depth_sum * 1.0 / current_comment_count
                if current_depth > max_depth:
                    max_depth = current_depth

            # ignore self-replies
            if pid != replyto_pid:
                # do not add a second edge if pid already links to replyto_pid
                v1 = author_dict[pid]
                v2 = author_dict[replyto_pid]
                if author_reply.get_eid(v1, v2, directed=True, error=False) == -1:
                    author_reply.add_edge(v1, v2)

            # number of commenters participating so far
            num_authors = author_reply.vcount()
            feature_dict['num_authors'] = num_authors

            # statistics written into the target file
            mean_degree = sum(author_reply.degree()) * 1.0 / author_reply.vcount()
            avg_local_transitivity = author_reply.transitivity_avglocal_undirected(mode='nan')  # avg of local transitivity
            clustering_coefficient = author_reply.transitivity_undirected(mode='zero')
            assortativity = author_reply.assortativity_degree(directed=False)
            num_componnet = len(author_reply.components(mode=WEAK))
            reply_density = author_reply.density(loops=True)
            # neither cohesion nor adhesion fits: almost every graph has an edge with a degree-1 endpoint
            #cohesion = author_reply.cohesion(neighbors='ignore')
            #adhesion = author_reply.adhesion()
            # Ref: http://igraph.sourceforge.net/doc/python/igraph.Graph-class.html#cohesive_blocks
            # cohesive_blocks only works on undirected graphs
            #author_reply_cohesive_block = author_reply.cohesive_blocks()
            #author_reply_max_cohesions = author_reply_cohesive_block.max_cohesions()

            feature_dict['mean_degree'] = mean_degree
            feature_dict['avg_local_transitivity'] = avg_local_transitivity
            feature_dict['clustering_coefficient'] = clustering_coefficient
            feature_dict['assortativity'] = assortativity
            feature_dict['num_componnet'] = num_componnet
            feature_dict['reply_density'] = reply_density
            # author-reply graph group cohesiveness
            #feature_dict['author_reply_max_cohesions'] = author_reply_max_cohesions

            # dynamic factors from WWW'13, Bao
            tree_density = comment_tree.density(loops=False)
            average_path_length = comment_tree.average_path_length(directed=False)  # do not consider direction
            #diffusion_depth = comment_tree.diameter(directed=True)  # diffusion depth of a tree, i.e. its depth
            diffusion_depth = max_depth
            # comment-tree related factors
            feature_dict['tree_density'] = tree_density
            feature_dict['diffusion_depth'] = diffusion_depth
            feature_dict['avg_weighted_depth_sum'] = avg_weighted_depth_sum
            feature_dict['avg_path_length'] = average_path_length  # Wiener index: the average distance between all pairs of nodes in a cascade

            # properties of the comment-author two-mode network
            ca_mean_degree = sum(comment_author_bigraph.degree()) * 1.0 / comment_author_bigraph.vcount()
            ca_avg_local_transitivity = comment_author_bigraph.transitivity_avglocal_undirected(mode='nan')  # avg of local transitivity
            ca_clustering_coefficient = comment_author_bigraph.transitivity_undirected(mode='zero')
            ca_assortativity = comment_author_bigraph.assortativity_degree(directed=False)
            ca_num_componnet = len(comment_author_bigraph.components(mode=WEAK))
            ca_reply_density = comment_author_bigraph.density(loops=True)
            #comment_author_cohesive_block = comment_author_bigraph.cohesive_blocks()
            #ca_max_cohesions = comment_author_cohesive_block.max_cohesions()

            feature_dict['ca_mean_degree'] = ca_mean_degree
            feature_dict['ca_avg_local_transitivity'] = ca_avg_local_transitivity
            feature_dict['ca_clustering_coefficient'] = ca_clustering_coefficient
            feature_dict['ca_assortativity'] = ca_assortativity
            feature_dict['ca_num_componnet'] = ca_num_componnet
            feature_dict['ca_reply_density'] = ca_reply_density
            #feature_dict['ca_max_cohesions'] = ca_max_cohesions

            # write the feature dict to the output buffer
            #tf.write(str(feature_dict) + '\n')
            result_string += (str(feature_dict) + '\n')

            # do not consider threads with more than MAX_COMMENT comments
            if current_comment_count >= MAX_COMMENT:
                break

        # if the loop ended before MAX_COMMENT and the comment count falls short
        # of the total claimed in the first line, ignore this post
        if current_comment_count < MAX_COMMENT and num_comment != current_comment_count:
            flag = False

        # print dynamic features (debugging)
        #plot(comment_tree)
        #plot(author_reply)
        #ipdb.set_trace()
        #print author_reply.transitivity_undirected(mode='zero')
        #print author_reply.transitivity_avglocal_undirected(mode='zero')  # avg of local transitivity
        #print author_reply.assortativity_degree(False)
        f.close()

        if flag:
            valid_topic_list.append(topic_id)
            # write the buffered features to disk
            print 'Saving: ', tpath
            tf = codecs.open(tpath, 'w', 'utf8')
            tf.write(result_string)
            tf.close()

    # save the ids of all valid posts
    path = target_base_path + section_id + '-post-list-feature.txt'
    f = codecs.open(path, 'w', 'utf8')
    for topic_id in valid_topic_list:
        f.write(topic_id + '\n')
    f.close()
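# The two feature extractors rely on names that their module headers would
# provide. A minimal sketch of the assumed header; the MAX_COMMENT value is an
# assumption matching the "more than 1000 comments" cut-off in the comments:
import os
import codecs
from igraph import Graph, WEAK, plot

MAX_COMMENT = 1000  # assumed value; threads are truncated at this comment count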
def main(group_id):
    base_path = '/home/kqc/dataset/douban-group/'
    #group_id = 'qiong'
    topic_path = base_path + 'TopicList-' + group_id + '.txt'
    topic_list = load_id_list(topic_path)
    target_base_path = 'data-dynamic/'
    #topic_list = ['1377621']
    for topic_id in topic_list:
        path = base_path + group_id + '/' + topic_id + '-info.txt'
        if not os.path.exists(path):
            continue
        print 'Reading topic file: ', path
        tpath = target_base_path + group_id + '/' + topic_id + '.txt'
        f = open(path, 'r')
        tf = open(tpath, 'w')

        # read topic info
        line = f.readline().strip()
        seg_list = line.split('[=]')
        if len(seg_list) < 7:
            print 'Error in the first line of topic file: ', path
            f.close()
            tf.close()
            continue
        lz = seg_list[2]  # LZ id for author reply-to, topic_id for the comment tree
        pubdate = seg_list[4]
        num_comment = int(seg_list[5])

        # first line: topic info
        feature_dict = dict()
        feature_dict['topic_id'] = topic_id
        feature_dict['lz'] = lz
        feature_dict['pubdate'] = pubdate
        feature_dict['num_comment'] = num_comment
        tf.write(str(feature_dict) + '\n')

        # build two graphs
        comment_tree = Graph(directed=True)
        author_reply = Graph(directed=True)
        # Build a two-mode network whose node classes are comments (including
        # the original post) and users. A comment and an author are linked if
        # 1) the author wrote the comment (including the LZ writing the post), or
        # 2) the author commented on it (the original post counts as a comment).
        # Comment vertices have type=False, author vertices type=True.
        comment_author_bigraph = Graph(directed=False)
        comment_author_bigraph.add_vertex(topic_id, type=False)
        comment_author_bigraph.add_vertex(lz, type=True)
        comment_author_bigraph.add_edge(topic_id, lz)

        comment_dict = dict()  # map comment id to graph index
        comment_dict[topic_id] = comment_tree.vcount()
        comment_tree.add_vertex(topic_id, date=pubdate, author=lz, depth=0)
        author_dict = dict()   # map author id to graph index
        author_dict[lz] = author_reply.vcount()
        author_reply.add_vertex(lz)

        max_depth = 0
        # measures how heated the comment-tree discussion is: the root
        # contributes 0, each depth-1 node contributes 1, and so on
        weighted_depth_sum = 0
        current_comment_count = 0  # number of comments seen so far
        for line in f:
            # collect all features in feature_dict; order does not matter
            feature_dict = dict()
            line = line.strip()
            seg_list = line.split('[=]')
            if len(seg_list) < 7:
                print 'Error in the comment line of topic file: ', path
                break
            cid = seg_list[0]
            pid = seg_list[3]
            pubdate = seg_list[4]
            replyto = seg_list[5]

            feature_dict['cid'] = cid
            feature_dict['pid'] = pid
            feature_dict['pubdate'] = pubdate
            feature_dict['replyto'] = replyto  # cid of the comment replied to

            comment_dict[cid] = comment_tree.vcount()
            comment_tree.add_vertex(cid, date=pubdate, author=pid)  # the commenter is this node's author
            current_comment_count += 1
            feature_dict['current_comment_count'] = current_comment_count

            comment_author_bigraph.add_vertex(cid, type=False)
            # if this author has commented before, it is already in author_dict
            if not pid in author_dict:
                author_dict[pid] = author_reply.vcount()
                author_reply.add_vertex(pid)
                comment_author_bigraph.add_vertex(pid, type=True)
            # the author-of relationship
            comment_author_bigraph.add_edge(pid, cid)

            replyto_pid = ''
            commenton_cid = ''
            if replyto == '':
                commenton_cid = topic_id
                parent_index = comment_dict[commenton_cid]
                #comment_tree.add_edge(cid, topic_id)
                replyto_pid = lz
                #author_reply.add_edge(pid, lz)
            else:
                commenton_cid = replyto
                #comment_tree.add_edge(cid, replyto)
                parent_index = comment_dict[commenton_cid]
                replyto_pid = comment_tree.vs[parent_index]['author']
                #author_reply.add_edge(pid, comment_tree.vs[index]['author'])
            comment_tree.add_edge(cid, commenton_cid)
            comment_author_bigraph.add_edge(pid, commenton_cid)

            # set the depth attribute of the cid node
            index = comment_dict[cid]
            current_depth = comment_tree.vs[index]['depth'] = comment_tree.vs[parent_index]['depth'] + 1
            weighted_depth_sum += current_depth
            avg_weighted_depth_sum = weighted_depth_sum * 1.0 / current_comment_count
            if current_depth > max_depth:
                max_depth = current_depth

            # ignore self-replies
            if pid != replyto_pid:
                # do not add a second edge if pid already links to replyto_pid
                v1 = author_dict[pid]
                v2 = author_dict[replyto_pid]
                if author_reply.get_eid(v1, v2, directed=True, error=False) == -1:
                    author_reply.add_edge(v1, v2)

            # number of participating commenters
            num_authors = author_reply.vcount()
            feature_dict['num_authors'] = num_authors

            # statistics written into the target file
            mean_degree = sum(author_reply.degree()) * 1.0 / author_reply.vcount()
            avg_local_transitivity = author_reply.transitivity_avglocal_undirected(mode='nan')  # avg of local transitivity
            clustering_coefficient = author_reply.transitivity_undirected(mode='zero')
            assortativity = author_reply.assortativity_degree(directed=False)
            num_componnet = len(author_reply.components(mode=WEAK))
            reply_density = author_reply.density(loops=True)
            # neither cohesion nor adhesion fits: almost every graph has an edge with a degree-1 endpoint
            #cohesion = author_reply.cohesion(neighbors='ignore')
            #adhesion = author_reply.adhesion()
            # Ref: http://igraph.sourceforge.net/doc/python/igraph.Graph-class.html#cohesive_blocks
            # cohesive_blocks only works on undirected graphs
            #author_reply_cohesive_block = author_reply.cohesive_blocks()
            #author_reply_max_cohesions = author_reply_cohesive_block.max_cohesions()

            feature_dict['mean_degree'] = mean_degree
            feature_dict['avg_local_transitivity'] = avg_local_transitivity
            feature_dict['clustering_coefficient'] = clustering_coefficient
            feature_dict['assortativity'] = assortativity
            feature_dict['num_componnet'] = num_componnet
            feature_dict['reply_density'] = reply_density
            # author-reply graph group cohesiveness
            #feature_dict['author_reply_max_cohesions'] = author_reply_max_cohesions

            # dynamic factors from WWW'13, Bao
            tree_density = comment_tree.density(loops=False)
            average_path_length = comment_tree.average_path_length(directed=False)  # do not consider direction
            #diffusion_depth = comment_tree.diameter(directed=True)  # diffusion depth of a tree, i.e. its depth
            diffusion_depth = max_depth
            # comment-tree related factors
            feature_dict['tree_density'] = tree_density
            feature_dict['diffusion_depth'] = diffusion_depth
            feature_dict['avg_weighted_depth_sum'] = avg_weighted_depth_sum
            feature_dict['avg_path_length'] = average_path_length  # Wiener index: the average distance between all pairs of nodes in a cascade

            # properties of the comment-author two-mode network
            ca_mean_degree = sum(comment_author_bigraph.degree()) * 1.0 / comment_author_bigraph.vcount()
            ca_avg_local_transitivity = comment_author_bigraph.transitivity_avglocal_undirected(mode='nan')  # avg of local transitivity
            ca_clustering_coefficient = comment_author_bigraph.transitivity_undirected(mode='zero')
            ca_assortativity = comment_author_bigraph.assortativity_degree(directed=False)
            ca_num_componnet = len(comment_author_bigraph.components(mode=WEAK))
            ca_reply_density = comment_author_bigraph.density(loops=True)
            #comment_author_cohesive_block = comment_author_bigraph.cohesive_blocks()
            #ca_max_cohesions = comment_author_cohesive_block.max_cohesions()

            feature_dict['ca_mean_degree'] = ca_mean_degree
            feature_dict['ca_avg_local_transitivity'] = ca_avg_local_transitivity
            feature_dict['ca_clustering_coefficient'] = ca_clustering_coefficient
            feature_dict['ca_assortativity'] = ca_assortativity
            feature_dict['ca_num_componnet'] = ca_num_componnet
            feature_dict['ca_reply_density'] = ca_reply_density
            #feature_dict['ca_max_cohesions'] = ca_max_cohesions

            # write the feature dict to file
            tf.write(str(feature_dict) + '\n')

            # do not consider threads with more than MAX_COMMENT comments
            if current_comment_count >= MAX_COMMENT:
                break

        # print dynamic features (debugging)
        #plot(comment_tree)
        #plot(author_reply)
        #ipdb.set_trace()
        #print author_reply.transitivity_undirected(mode='zero')
        #print author_reply.transitivity_avglocal_undirected(mode='zero')  # avg of local transitivity
        #print author_reply.assortativity_degree(False)
        f.close()
        tf.close()
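# A toy check of the depth bookkeeping both extractors use (hypothetical ids,
# no igraph needed). A thread with replies c1->post, c2->c1, c3->c1 should give
# max_depth = 2 and avg_weighted_depth_sum = (1+2+2)/3:
def toy_depth_check():
    parent = {'c1': 'post', 'c2': 'c1', 'c3': 'c1'}  # replyto relation
    depth = {'post': 0}
    weighted_depth_sum = 0
    max_depth = 0
    count = 0
    for cid in ['c1', 'c2', 'c3']:  # comments arrive in publish order
        depth[cid] = depth[parent[cid]] + 1
        count += 1
        weighted_depth_sum += depth[cid]
        if depth[cid] > max_depth:
            max_depth = depth[cid]
    print 'max_depth:', max_depth                                      # 2
    print 'avg_weighted_depth_sum:', weighted_depth_sum * 1.0 / count  # ~1.67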
def main(group_id):
    topiclist_path = 'data-dynamic/TopicList-' + group_id + '-filtered.txt'
    topic_list = load_id_list(topiclist_path)
    print 'Number of total topics loaded: ', len(topic_list)

    # Set the pre-computed popularity levels. A thread's final comment count may
    # exceed the largest level; note that the smallest popularity value, 0, is ignored.
    #pop_level = [8, 13, 23, 43, float('inf')]  # group: zhuangb
    pop_level = [25, 50, float('inf')]   # group: zhuangb
    #pop_level = [25, 50, float('inf')]  # group: buybook
    #pop_level = [30, float('inf')]      # group: buybook

    # prediction_date: start predicting this long after a post is published
    # target_date: predict the comment count at this point in time
    # Both parameters are tunable.
    # sampling interval
    gaptime = timedelta(hours=5)
    prediction_date = timedelta(hours=10*5)
    response_time = timedelta(hours=50)
    target_date = prediction_date + response_time
    # number of intervals each topic has before prediction_date
    num_feature = int(prediction_date.total_seconds() / gaptime.total_seconds())
    print 'Number of features: ', num_feature

    alpha = 1.5
    percentage_threshold = 0.7

    print 'Generating training and test dataset...'
    dataset, comment_count_dataset, Bao_dataset, category_count_list = prepare_dataset(group_id, \
        topic_list, gaptime, pop_level, prediction_date, target_date, alpha, percentage_threshold)

    # save the topic ids that survived filtering
    #save_filtered_topics(group_id, dataset)

    #print 'Plotting factor propagation'
    #factor_propagation_plot(dataset, num_feature)
    #topic_propagation_plot(dataset, num_feature)
    #return

    # Shuffle the posts. Disabled while debugging so that runs are reproducible.
    #shuffle(dataset)

    print 'Down-sampling the datasets...'
    dataset, comment_count_dataset, Bao_dataset, category_count_list = down_sampling_dataset(dataset, \
        comment_count_dataset, Bao_dataset, category_count_list)

    total = len(dataset)
    train_cnt = total * 4 / 5
    train_set = dataset[:train_cnt]
    test_set = dataset[train_cnt:]
    print 'Training: %d, Test: %d' % (train_cnt, total-train_cnt)
    print 'Category 0: %d, Category 1: %d ' % (category_count_list[0], category_count_list[1])
    print 'Imbalance ratio: ', category_count_list[0] * 1.0 / category_count_list[1]

    #num_level = len(pop_level)
    #raw_input()
    #import ipdb
    #ipdb.set_trace()

    print 'The proposed model:'
    k = 3
    num_level = 2
    num_factor = len(train_set[0][1][1])
    print 'Classify test instances...'
    y_true, y_pred, comment_true, comment_pred, give_up_list, prediction_list = classify(train_set, test_set, k, num_level)
    # evaluate results
    print 'Number of give-ups: ', len(give_up_list)
    classification_evaluation(y_true, y_pred)
    level_MSE_evaluation(y_true, y_pred)
    #save_predictions(prediction_list, y_pred, factor_name='num_authors')
    #save_predictions(prediction_list, y_true, factor_name='all')
    comment_RSE_evaluation(comment_true, comment_pred)
    #print 'The class prior:', prior_score

    from svm_model import svm_model
    print 'Building a svm model...'
    y_true, y_pred = svm_model(train_set, test_set)
    classification_evaluation(y_true, y_pred)

    # inspect how each factor predicts at different ratios
    #from utils import ratio_accuracy_distribution_plot
    #ratio_accuracy_distribution_plot(y_true, y_pred, test_set, group_id, factor_name='tree_link_density')

    # S-H model
    print '\nThe S-H model:'
    baseline_train_set = comment_count_dataset[:train_cnt]
    baseline_test_set = comment_count_dataset[train_cnt:]
    y_true, y_pred, comment_true_cnt, comment_pred_cnt = SH_model(baseline_train_set, baseline_test_set, alpha)
    # drop some instances with cat = 0
    comment_RSE_evaluation(comment_true_cnt, comment_pred_cnt)
    # level-wise classification
    classification_evaluation(y_true, y_pred)
    level_MSE_evaluation(y_true, y_pred)

    print '\nML model:'
    y_true, y_pred, comment_true_cnt, comment_pred_cnt = ML_model(baseline_train_set, baseline_test_set, num_feature, alpha)
    comment_RSE_evaluation(comment_true_cnt, comment_pred_cnt)
    classification_evaluation(y_true, y_pred)

    print '\nMLR model:'
    y_true, y_pred, comment_true_cnt, comment_pred_cnt = MLR_model(baseline_train_set, baseline_test_set, num_feature, alpha)
    comment_RSE_evaluation(comment_true_cnt, comment_pred_cnt)
    classification_evaluation(y_true, y_pred)

    print '\nkNN method:'
    k = 1
    y_true, y_pred, comment_true_cnt, comment_pred_cnt = knn_method(train_set, test_set, k, num_feature, alpha)
    comment_RSE_evaluation(comment_true_cnt, comment_pred_cnt)
    # level-wise classification
    classification_evaluation(y_true, y_pred)

    print "\nBao's method:"
    Bao_train_set = Bao_dataset[:train_cnt]
    Bao_test_set = Bao_dataset[train_cnt:]
    y_true, y_pred, comment_true_cnt, comment_pred_cnt = Bao_method(Bao_train_set, Bao_test_set, alpha)
    comment_RSE_evaluation(comment_true_cnt, comment_pred_cnt)
    classification_evaluation(y_true, y_pred)
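# classification_evaluation and comment_RSE_evaluation come from the project's
# evaluation module, which is not shown. Hypothetical stand-ins, assuming
# standard accuracy plus a relative-squared-error metric on comment counts:
from sklearn import metrics

def classification_evaluation(y_true, y_pred):
    print 'Accuracy: ', metrics.accuracy_score(y_true, y_pred)
    print metrics.classification_report(y_true, y_pred)

def comment_RSE_evaluation(comment_true, comment_pred):
    # relative squared error, averaged over threads (assumed definition)
    rse = [((p - t) * 1.0 / t) ** 2 for t, p in zip(comment_true, comment_pred) if t > 0]
    print 'Mean RSE: ', sum(rse) / len(rse)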
        # (tail of select_k; the head of the function is not shown)
        IPW_acc_list.append(IPW_acc)
    return IPW_acc_list

if __name__ == "__main__":
    import sys
    group_id = sys.argv[1]
    topiclist_path = "data-dynamic/TopicList-" + group_id + "-filtered.txt"  # for douban-group
    # topiclist_path = 'data-dynamic/' + group_id + '-post-list.txt'  # for the Tianya dataset
    print "Reading topic list from file:", topiclist_path
    topic_list = load_id_list(topiclist_path)
    print "Number of total topics loaded: ", len(topic_list)
    # for test
    # topic_list = topic_list[:50]

    threshold_p = 0.7
    prediction_date_tr = timedelta(hours=50)
    response_time_delta = timedelta(hours=25)
    # cvk_list = [1, 3, 5, 7]
    cvk_list = [9]
    for cvk in cvk_list:
        print "CV for k: ", cvk
        IPW_acc_list = select_k(group_id, topic_list, threshold_p, prediction_date_tr, response_time_delta, cvk)
        print IPW_acc_list
        print "avg_IPW_acc: ", sum(IPW_acc_list) * 1.0 / len(IPW_acc_list)
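# Usage sketch (the script file name is an assumption; per sys.argv[1] above,
# the group id is the only command-line argument):
#
#   $ python select_k.py zhuangb
#
# This prints the per-fold IPW accuracies and their mean for each k in cvk_list.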