Example #1
import traceback

def cal_co_occurence_matrix_ochiai(feature_input_path,
                                   high_freq_word_input_path, output_path):
    try:

        # Collect every feature list; each article's features are stored as a set
        _, file_path_list = get_all_file(feature_input_path)
        file_path_list.sort()

        # Each article's MeSH list is one feature; build a set from it and
        # collect every feature set in feature_set_list.
        feature_set_list = []
        for file_path in file_path_list:

            infile = open(file_path, "r")

            for line in infile:
                feature = line.rstrip('\n').split(',')
                feature_set = set(feature)
                feature_set_list.append(feature_set)

            infile.close()

        # Load the high-frequency words and their counts
        high_freq_word_list = []
        word_counter = {}
        infile = open(high_freq_word_input_path, "r")
        for line in infile:
            line = line.rstrip('\n').split(',')

            word = line[0]
            word_cnt = int(line[1])

            high_freq_word_list.append(word)
            word_counter[word] = word_cnt

        infile.close()

        # Build the co-occurrence matrix over the high-frequency words
        co_occurence_matrix = []
        for u in high_freq_word_list:
            vector = []
            for v in high_freq_word_list:
                ochiai_index = ochiai_similarity(u, v, feature_set_list,
                                                 word_counter)
                vector.append(ochiai_index)
            co_occurence_matrix.append(vector)

        # Write the matrix back to the output file
        outfile = open(output_path, "w")
        for row in co_occurence_matrix:

            line = ""
            for item in row:
                line = line + str(item) + ' '
            line = line.rstrip(' ')
            outfile.write(line + '\n')

        outfile.close()

    except Exception:
        traceback.print_exc()
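
The ochiai_similarity helper used above is not part of this listing. The sketch below is one plausible implementation, assuming the standard Ochiai coefficient |A ∩ B| / sqrt(|A| * |B|): the numerator counts the articles whose feature set contains both words, and the denominators come from word_counter; the real helper may differ.

import math


def ochiai_similarity(u, v, feature_set_list, word_counter):
    # Count the articles in which u and v co-occur.
    co_occurrence = 0
    for feature_set in feature_set_list:
        if u in feature_set and v in feature_set:
            co_occurrence += 1

    # Standard Ochiai coefficient; guard against a zero denominator.
    denominator = math.sqrt(word_counter[u] * word_counter[v])
    if denominator == 0:
        return 0.0
    return co_occurrence / denominator
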
Example #2
def cal_co_author_ratio_and_nums_of_author_per_article(feature_list_path,
                                                       output_path_ratio,
                                                       output_path_nums):
    try:

        co_author_ratio_list = []
        nums_of_author_per_article_list = []

        _, file_list = get_all_file(feature_list_path)
        file_list.sort()

        for file_path in file_list:
            infile = open(file_path, "r")

            article_only_one_author_num = 0
            total_author_num = 0
            feature_list = []
            for line in infile:
                feature = line.rstrip('\n').split(',')
                feature_list.append(feature)

                author_num_per_article = len(feature)
                total_author_num += author_num_per_article

                if author_num_per_article == 1:
                    article_only_one_author_num += 1

            total_article_num = len(feature_list)

            infile.close()

            co_author_ratio = 1 - 1.0 * article_only_one_author_num / total_article_num
            num_of_author_per_article = 1.0 * total_author_num / total_article_num

            co_author_ratio_list.append(co_author_ratio)
            nums_of_author_per_article_list.append(num_of_author_per_article)

        outfile = open(output_path_ratio, "w")
        for ratio in co_author_ratio_list:
            outfile.write(str(ratio) + '\n')
        outfile.close()

        outfile = open(output_path_nums, "w")
        for nums in nums_of_author_per_article_list:
            outfile.write(str(nums) + '\n')
        outfile.close()

    except Exception:
        traceback.print_exc()
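
Every example also relies on a get_all_file helper that is not shown. Below is a minimal sketch under the assumption that it walks a directory tree and returns a pair (list of subdirectories, list of file paths); the callers above only use the second element, and the real helper may differ.

import os


def get_all_file(root_path):
    # Walk root_path and collect subdirectory and file paths.
    dir_list = []
    file_list = []
    for dirpath, dirnames, filenames in os.walk(root_path):
        for d in dirnames:
            dir_list.append(os.path.join(dirpath, d))
        for f in filenames:
            file_list.append(os.path.join(dirpath, f))
    return dir_list, file_list
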
Example #3
def word_count(input_path, output_path):
    try:

        _, file_list = get_all_file(input_path)
        file_list.sort()

        word_counter = {}

        for file_path in file_list:
            infile = open(file_path, "r")

            for line in infile:
                line = line.rstrip('\n').split(',')
                for word in line:
                    if word in word_counter:
                        word_counter[word] += 1
                    else:
                        word_counter[word] = 1

            infile.close()

        # Count how many words occur only once
        number_of_one = 0
        for key in word_counter:
            if word_counter[key] == 1:
                number_of_one += 1

        sorted_list = sorted(word_counter.items(),
                             key=lambda item: item[1],
                             reverse=True)

        outfile = open(output_path, "w")
        for word, cnt in sorted_list:
            line = str(word) + ',' + str(cnt)
            line = line.lstrip(' ')
            outfile.write(line + '\n')
        outfile.write("number of 1 : " + str(number_of_one) + '\n')
        outfile.close()

    except Exception:
        traceback.print_exc()
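
The word,count lines written by word_count are the same format that Example #1 reads as its high-frequency word file, apart from the trailing summary line. The helper below is hypothetical (not in the source) and only sketches how the top-N words could be selected into that input format.

def select_high_freq_words(word_count_path, high_freq_word_path, top_n):
    # Keep the first top_n "word,count" lines (the file is already sorted by
    # count, descending) and skip the trailing summary line.
    infile = open(word_count_path, "r")
    outfile = open(high_freq_word_path, "w")
    written = 0
    for line in infile:
        parts = line.rstrip('\n').split(',')
        if len(parts) != 2:
            continue
        outfile.write(parts[0] + ',' + parts[1] + '\n')
        written += 1
        if written >= top_n:
            break
    infile.close()
    outfile.close()
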
Example #4
"""


def get_total_mesh_number(input_path, cur_number):
    try:

        total = 0
        infile = open(input_path, "r")
        for line in infile:
            line = line.rstrip('\n').split(',')
            total += len(line)
        infile.close()

        return cur_number + total
    except Exception:
        traceback.print_exc()
        return cur_number  # keep the running total usable even if one file fails


if __name__ == '__main__':
    # Sum the MeSH term counts over the feature-list files (2006 to 2015)

    ROOT_PATH = "../../output/feature_list/"
    #ROOT_PATH = "../../output/feature_author_list/"
    _, file_list = get_all_file(ROOT_PATH)
    file_list.sort()

    cur_total = 0
    for file_path in file_list:
        cur_total = get_total_mesh_number(file_path, cur_total)

    print "total mesh number is ", cur_total