def cal_co_occurence_matrix_ochiai(feature_input_path, high_freq_word_intput_path, output_path):
    """Build an Ochiai-similarity co-word matrix for the high-frequency words.

    Reads per-article feature (MeSH term) lists from every file under
    ``feature_input_path`` (one comma-separated list per line), reads the
    high-frequency words from ``high_freq_word_intput_path`` (lines formatted
    as ``word,count``), computes the pairwise Ochiai index via the external
    ``ochiai_similarity`` helper, and writes one space-separated matrix row
    per line to ``output_path``.

    Errors are reported via ``traceback.print_exc()`` and swallowed,
    preserving the original best-effort behaviour.
    """
    try:
        # Each input line is one article's feature list; keep it as a set so
        # ochiai_similarity can do O(1) membership tests.
        _, file_path_list = get_all_file(feature_input_path)
        file_path_list.sort()
        feature_set_list = []
        for file_path in file_path_list:
            with open(file_path, "r") as infile:
                for line in infile:
                    feature_set_list.append(set(line.rstrip('\n').split(',')))

        # Load the high-frequency words and their global occurrence counts.
        high_freq_word_list = []
        word_counter = {}
        with open(high_freq_word_intput_path, "r") as infile:
            for line in infile:
                parts = line.rstrip('\n').split(',')
                word = parts[0]
                high_freq_word_list.append(word)
                word_counter[word] = int(parts[1])

        # Pairwise Ochiai index between every ordered pair of words
        # (includes the diagonal u == v, as in the original).
        co_occurence_matrix = [
            [ochiai_similarity(u, v, feature_set_list, word_counter)
             for v in high_freq_word_list]
            for u in high_freq_word_list
        ]

        # One space-separated row per output line.
        with open(output_path, "w") as outfile:
            for row in co_occurence_matrix:
                outfile.write(' '.join(str(item) for item in row) + '\n')
    except Exception:
        # print_exc() already writes the trace; the former
        # `print traceback.print_exc()` additionally printed a spurious None.
        traceback.print_exc()
def cal_co_author_ratio_and_nums_of_author_per_article(feature_list_path, output_path_ratio, output_path_nums):
    """Compute per-file co-authorship statistics and write them out.

    For every file under ``feature_list_path`` (one comma-separated author
    list per line, one line per article) computes:

    * the co-author ratio: 1 - (single-author articles / total articles),
      appended as one line per input file to ``output_path_ratio``;
    * the mean number of authors per article, appended as one line per input
      file to ``output_path_nums``.

    Empty input files are skipped (the original divided by zero, which the
    broad except turned into an abort that lost all output). Other errors are
    reported via ``traceback.print_exc()`` and swallowed.
    """
    try:
        co_author_ratio_list = []
        nums_of_author_per_article_list = []
        _, file_list = get_all_file(feature_list_path)
        file_list.sort()
        for file_path in file_list:
            article_only_one_author_num = 0
            total_author_num = 0
            total_article_num = 0
            with open(file_path, "r") as infile:
                for line in infile:
                    # One article per line; fields are the article's authors.
                    authors = line.rstrip('\n').split(',')
                    total_article_num += 1
                    total_author_num += len(authors)
                    if len(authors) == 1:
                        article_only_one_author_num += 1
            if total_article_num == 0:
                # Guard: an empty file would otherwise raise ZeroDivisionError.
                continue
            co_author_ratio = 1 - 1.0 * article_only_one_author_num / total_article_num
            num_of_author_per_article = 1.0 * total_author_num / total_article_num
            co_author_ratio_list.append(co_author_ratio)
            nums_of_author_per_article_list.append(num_of_author_per_article)

        with open(output_path_ratio, "w") as outfile:
            for ratio in co_author_ratio_list:
                outfile.write(str(ratio) + '\n')
        with open(output_path_nums, "w") as outfile:
            for nums in nums_of_author_per_article_list:
                outfile.write(str(nums) + '\n')
    except Exception:
        # print_exc() already writes the trace; the former
        # `print traceback.print_exc()` additionally printed a spurious None.
        traceback.print_exc()
def word_count(input_path, output_path):
    """Count word frequencies across every file under ``input_path``.

    Each input line is a comma-separated list of words. Writes ``word,count``
    lines to ``output_path`` sorted by count descending, followed by a summary
    line reporting how many words occurred exactly once. Errors are reported
    via ``traceback.print_exc()`` and swallowed, preserving the original
    best-effort behaviour.
    """
    try:
        _, file_list = get_all_file(input_path)
        file_list.sort()
        word_counter = {}
        for file_path in file_list:
            with open(file_path, "r") as infile:
                for line in infile:
                    for word in line.rstrip('\n').split(','):
                        # dict.get collapses the original if/else counter.
                        word_counter[word] = word_counter.get(word, 0) + 1

        # Hapax count: words that appear exactly once in the whole corpus.
        number_of_one = sum(1 for cnt in word_counter.values() if cnt == 1)

        sorted_list = sorted(word_counter.items(), key=lambda item: item[1], reverse=True)
        with open(output_path, "w") as outfile:
            for word, cnt in sorted_list:
                line = str(word) + ',' + str(cnt)
                # lstrip kept from the original: trims a leading space if a
                # word was stored with one.
                outfile.write(line.lstrip(' ') + '\n')
            outfile.write("number of 1 : " + str(number_of_one) + '\n')
    except Exception:
        # print_exc() already writes the trace; the former
        # `print traceback.print_exc()` additionally printed a spurious None.
        traceback.print_exc()
""" def get_total_mesh_number(input_path, cur_number): try: total = 0 infile = open(input_path, "r") for line in infile: line = line.rstrip('\n').split(',') total += len(line) infile.close() return cur_number + total except Exception, e: print traceback.print_exc() if __name__ == '__main__': # get xml file from 2006 to 2015 ROOT_PATH = "../../output/feature_list/" #ROOT_PATH = "../../output/feature_author_list/" _, file_list = get_all_file(ROOT_PATH) file_list.sort() cur_total = 0 for file_path in file_list: cur_total = get_total_mesh_number(file_path, cur_total) print "total mesh number is ", cur_total