Example #1
def text_classify(read_filename1, read_filename2, read_filename3, write_filename):
    """
    Query classification: run each query pattern against the search texts
    and write the matches to the output file.
    :param read_filename1: file of query patterns
    :param read_filename2: file of "word weight" pairs, one pair per line
    :param read_filename3: file of search texts, one text per line
    :param write_filename: output file for the query results
    """
    # Load the query patterns.
    query_pattern = []
    get_text_to_complex_list(query_pattern, read_filename1, 0)

    # Build the word -> weight lookup table.
    word_weight_dict = {}
    with open(read_filename2, "r") as f:
        for line in f:
            fields = line.split()
            word_weight_dict[fields[0]] = float(fields[1])

    # Load the texts to search, one per line.
    search_texts = []
    with open(read_filename3, "r") as f:
        for line in f:
            search_texts.append(line.strip())

    # Run each pattern as a query and serialize the hits, space-separated.
    result = []
    for i in range(len(query_pattern)):
        this_result = query(query_pattern[i], search_texts, word_weight_dict)
        result.append(" ".join([str(x) for x in this_result]))

    quick_write_list_to_text(result, write_filename)
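For context, a sketch of how text_classify might be invoked. The file names below are hypothetical; the expected formats follow from how the function parses each file, and the helpers (get_text_to_complex_list, query, quick_write_list_to_text) are assumed to come from the project's companion modules.

# Hypothetical invocation; file names are made up for illustration.
# patterns.txt - one query pattern per line (loaded via get_text_to_complex_list)
# weights.txt  - one "word weight" pair per line, e.g. "apple 0.73"
# corpus.txt   - one search text per line
text_classify("patterns.txt", "weights.txt", "corpus.txt", "results.txt")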
Example #2
import numpy as np
from operator import itemgetter


def compute_similarity(pattern_list, read_filename, word_weight_dict):
    """
    Measure pairwise similarity between frequent itemsets via the Jaccard
    similarity of the text sets their queries match, then derive the number
    of cluster centers from a coarse partition of that matrix.
    """
    # Load the texts to run the queries against.
    search_texts = []
    get_text_to_single_list(search_texts, read_filename)

    # Run each frequent itemset as a query; each result lists the texts it matches.
    query_result_list = []
    for i in range(len(pattern_list)):
        query_result_list.append(query(pattern_list[i], search_texts, word_weight_dict))
    
    # Pairwise Jaccard similarity between the matched-text sets: the more
    # texts two frequent itemsets' queries have in common, the more similar
    # the itemsets are taken to be (see TextQuery.py).
    similarity_matrix = np.zeros([len(pattern_list), len(pattern_list)])
    tag = []
    for i in range(len(pattern_list)):
        tag.append(0)
        for j in range(i, len(pattern_list)):
            numerator = len(set(query_result_list[i]) & set(query_result_list[j]))
            denominator = len(set(query_result_list[i]) | set(query_result_list[j]))

            similarity_matrix[i, j] = np.true_divide(numerator, denominator)
            similarity_matrix[j, i] = similarity_matrix[i, j]
    
    # Partition in stages to determine the number of cluster centers: each
    # still-untagged itemset seeds a group containing every itemset j >= i
    # whose similarity to the seed exceeds 0.2. (Only the seed is checked
    # against `tag`, so an itemset can appear in more than one group.)
    class_partion = []
    for i in range(len(pattern_list)):
        if tag[i] == 0:
            temp_class_partion = []
            for j in range(i, len(pattern_list)):
                if similarity_matrix[i, j] > 0.2:
                    temp_class_partion.append(j)
                    tag[j] = 1
            class_partion.append(temp_class_partion)
    
    partion_length = []
    for each in class_partion:
        partion_length.append(len(each))

    # Sort the groups by size, largest first.
    cl = zip(class_partion, partion_length)
    cl = sorted(cl, key=itemgetter(1), reverse=True)

    class_partion = []
    partion_length = []

    for each in cl:
        class_partion.append(each[0])
        partion_length.append(each[1])
    
    length_sum = np.sum(partion_length)
    temp_sum = 0
    cluster_number = 0
    for i in range(len(partion_length)):
        temp_sum += partion_length[i]
        cluster_number += 1

        # Hard cutoff: once the largest groups cover 75% of all frequent
        # itemsets, the number of groups seen so far is the cluster count.
        if np.true_divide(temp_sum, length_sum) > 0.75:
            break

    # Serialize the leading groups, one space-separated line per group.
    class_partion_to_string = []
    for i in range(cluster_number):
        class_partion_to_string.append(" ".join([str(x) for x in class_partion[i]]))

    print(cluster_number)

    query_result_list_string = []
    for each in query_result_list:
        query_result_list_string.append(" ".join([str(x) for x in each]))

    # Optionally persist the groups for inspection:
    # quick_write_list_to_text(class_partion_to_string, 'D:/partion2.txt')
    
    return similarity_matrix, cluster_number
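The similarity step above is plain Jaccard similarity between match sets. Below is a self-contained sketch of the same matrix construction, independent of the project's helpers; the empty-union guard is our addition, since the original divides unconditionally.

import numpy as np

def jaccard_matrix(result_sets):
    # Pairwise Jaccard similarity |A & B| / |A | B| between match sets.
    n = len(result_sets)
    sim = np.zeros((n, n))
    for i in range(n):
        for j in range(i, n):
            union = result_sets[i] | result_sets[j]
            inter = result_sets[i] & result_sets[j]
            score = len(inter) / float(len(union)) if union else 0.0
            sim[i, j] = sim[j, i] = score
    return sim

# jaccard_matrix([{0, 1, 2}, {1, 2, 3}, {7}]) gives sim[0, 1] == 0.5
# ({1, 2} shared out of {0, 1, 2, 3} combined) and sim[0, 2] == 0.0.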
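The cluster-count rule at the end also factors out cleanly: sort the group sizes in descending order and count how many of the largest groups are needed to cover 75% of all frequent itemsets. A minimal sketch of that rule; the function name is ours, and the 0.75 default mirrors the threshold hard-coded above.

def cluster_count(group_sizes, coverage=0.75):
    # Number of largest groups needed to cover `coverage` of all itemsets.
    sizes = sorted(group_sizes, reverse=True)
    total = float(sum(sizes))
    covered = 0
    for k, size in enumerate(sizes, start=1):
        covered += size
        if covered / total > coverage:
            return k
    return len(sizes)

# cluster_count([5, 3, 1, 1]) == 2: the two largest groups cover
# 8 of 10 itemsets (80% > 75%), so two cluster centers are kept.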