Example #1
def text_classify(read_filename1, read_filename2, read_filename3, write_filename):
    """
    Query classification
    :param read_filename1:
    :param read_filename2:
    :param read_filename3:
    :param write_filename:
    """

    query_pattern = []
    get_text_to_complex_list(query_pattern, read_filename1, 0)

    word_weight_dict = {}
    f = open(read_filename2, "r")
    line = f.readline()
    while line:
        word_weight_dict[line.split()[0]] = float(line.split()[1])
        line = f.readline()
    f.close()

    search_texts = []
    f1 = open(read_filename3, "r")
    line = f1.readline()
    while line:
        search_texts.append(line.strip())
        line = f1.readline()
    f1.close()

    result = []
    for i in range(len(query_pattern)):
        this_result = query(query_pattern[i], search_texts, word_weight_dict)
        result.append(" ".join([str(x) for x in this_result]))

    quick_write_list_to_text(result, write_filename)
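The helper functions get_text_to_complex_list, get_text_to_single_list and quick_write_list_to_text are called throughout these examples but never shown. A minimal sketch consistent with how they are used here (an assumption, not the original implementation; the third argument of get_text_to_complex_list is taken to be the number of header lines to skip):

def get_text_to_complex_list(target_list, filename, skip_rows):
    # Read a whitespace-delimited file into a 2D list of strings,
    # skipping the first skip_rows lines.
    with open(filename, "r") as f:
        for index, line in enumerate(f):
            if index >= skip_rows:
                target_list.append(line.strip().split())

def get_text_to_single_list(target_list, filename):
    # Read a file into a flat list, one stripped line per element.
    with open(filename, "r") as f:
        for line in f:
            target_list.append(line.strip())

def quick_write_list_to_text(string_list, filename):
    # Write each string in the list as one line of the output file.
    with open(filename, "w") as f:
        f.write("\n".join(string_list) + "\n")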
Example #2
def get_key_words(read_filename, write_filename1, write_filename2):
    '''
    Extract keywords with jieba word segmentation
    :param read_filename:
    :param write_filename1:
    :param write_filename2:
    '''
    
    each_weibo_fenci = []        
    get_text_to_complex_list(each_weibo_fenci, read_filename, 0)
        
    key_words = []
    all_key_words =  []
    for row in range(len(each_weibo_fenci)):
        word_entity = []

        for each in each_weibo_fenci[row]:
            word_entity.append(each.split('/')[0])

        tags = jieba.analyse.extract_tags(" ".join(word_entity), 3)
        key_words.append(" ".join(tags))
            
        for word in " ".join(tags).split():
            if word not in all_key_words:
                all_key_words.append(word)
        
    quick_write_list_to_text(key_words, write_filename1)
    quick_write_list_to_text(all_key_words, write_filename2)
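For reference, jieba.analyse.extract_tags takes the raw text and the number of keywords to return (ranked by TF-IDF). A small standalone usage sketch with a made-up input sentence:

import jieba.analyse

text = "自然语言处理是人工智能研究的一个重要方向"  # hypothetical input sentence
tags = jieba.analyse.extract_tags(text, 3)  # top 3 keywords
print(" ".join(tags))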
Example #3
def get_key_words(read_filename, write_filename1, write_filename2):
    '''
    Extract keywords with jieba word segmentation
    :param read_filename:
    :param write_filename1:
    :param write_filename2:
    '''

    each_weibo_fenci = []
    get_text_to_complex_list(each_weibo_fenci, read_filename, 0)

    key_words = []
    all_key_words = []
    for row in range(len(each_weibo_fenci)):
        word_entity = []

        for each in each_weibo_fenci[row]:
            word_entity.append(each.split('/')[0])

        tags = jieba.analyse.extract_tags(" ".join(word_entity), 3)
        key_words.append(" ".join(tags))

        for word in " ".join(tags).split():
            if word not in all_key_words:
                all_key_words.append(word)

    quick_write_list_to_text(key_words, write_filename1)
    quick_write_list_to_text(all_key_words, write_filename2)
Example #4
def text_classify(read_filename1, read_filename2, read_filename3, write_filename):
    
    query_pattern = []
    get_text_to_complex_list(query_pattern, read_filename1, 0)
    
    word_weight_dict = {}
    f = open(read_filename2, 'r')
    line = f.readline()
    while line:
        word_weight_dict[line.split()[0]] = float(line.split()[1])
        line = f.readline()
    f.close()
    
    search_texts = []
    f1 = open(read_filename3, 'r')
    line = f1.readline()
    while line:
        search_texts.append(line.strip())
        line = f1.readline()  
    f1.close()
    
    result = []
    for i in range(len(search_texts)):
        result.append([])
        
    for i in range(len(query_pattern)):
        this_result = query2(query_pattern[i], search_texts, word_weight_dict)
        result[this_result].append(str(i))
    
    result_to_string = []
    for each in result:
        result_to_string.append(" ".join(each))
    
    quick_write_list_to_text(result_to_string, write_filename)
Example #5
def text_classify(read_filename1, read_filename2, read_filename3,
                  write_filename):
    '''
    Query classification
    :param read_filename1:
    :param read_filename2:
    :param read_filename3:
    :param write_filename:
    '''

    query_pattern = []
    get_text_to_complex_list(query_pattern, read_filename1, 0)

    word_weight_dict = {}
    f = open(read_filename2, 'r')
    line = f.readline()
    while line:
        word_weight_dict[line.split()[0]] = float(line.split()[1])
        line = f.readline()
    f.close()

    search_texts = []
    f1 = open(read_filename3, 'r')
    line = f1.readline()
    while line:
        search_texts.append(line.strip())
        line = f1.readline()
    f1.close()

    result = []
    for i in range(len(query_pattern)):
        this_result = query(query_pattern[i], search_texts, word_weight_dict)
        result.append(" ".join([str(x) for x in this_result]))

    quick_write_list_to_text(result, write_filename)
Example #6
def text_classify(read_filename1, read_filename2, read_filename3,
                  write_filename):

    query_pattern = []
    get_text_to_complex_list(query_pattern, read_filename1, 0)

    word_weight_dict = {}
    f = open(read_filename2, 'r')
    line = f.readline()
    while line:
        word_weight_dict[line.split()[0]] = float(line.split()[1])
        line = f.readline()
    f.close()

    search_texts = []
    f1 = open(read_filename3, 'r')
    line = f1.readline()
    while line:
        search_texts.append(line.strip())
        line = f1.readline()
    f1.close()

    result = []
    for i in range(len(search_texts)):
        result.append([])

    for i in range(len(query_pattern)):
        this_result = query2(query_pattern[i], search_texts, word_weight_dict)
        result[this_result].append(str(i))

    result_to_string = []
    for each in result:
        result_to_string.append(" ".join(each))

    quick_write_list_to_text(result_to_string, write_filename)
Example #7
def get_key_words(read_directory, write_directory1, write_directory2):
    file_number = sum(
        [len(files) for root, dirs, files in os.walk(read_directory)])

    for i in range(file_number):
        each_weibo_fenci = []
        get_text_to_complex_list(each_weibo_fenci,
                                 read_directory + '/' + str(i + 1) + '.txt', 2)

        key_words = []
        all_key_words = []
        for row in range(len(each_weibo_fenci)):
            word_entity = []

            for each in each_weibo_fenci[row]:
                word_entity.append(each.split('/')[0])

            tags = jieba.analyse.extract_tags(" ".join(word_entity), 3)
            key_words.append(" ".join(tags))

            for word in " ".join(tags).split():
                if word not in all_key_words:
                    all_key_words.append(word)

        quick_write_list_to_text(key_words,
                                 write_directory1 + '/' + str(i + 1) + '.txt')
        quick_write_list_to_text(all_key_words,
                                 write_directory2 + '/' + str(i + 1) + '.txt')
Example #8
def get_key_words(read_directory, write_directory1, write_directory2):
    '''
    
    :param read_directory:
    :param write_directory1:
    :param write_directory2:
    '''

    
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory)])
    
    for i in range(file_number):
        each_weibo_fenci = []        
        get_text_to_complex_list(each_weibo_fenci, read_directory + '/' + str(i + 1) + '.txt', 0)
        
        key_words = []
        all_key_words =  []
        for row in range(len(each_weibo_fenci)):
            word_entity = []

            for each in each_weibo_fenci[row]:
                word_entity.append(each.split('/')[0])

            tags = jieba.analyse.extract_tags(" ".join(word_entity), 3)
            key_words.append(" ".join(tags))
            
            for word in " ".join(tags).split():
                if word not in all_key_words:
                    all_key_words.append(word)
        
        quick_write_list_to_text(key_words, write_directory1 + '/' + str(i + 1) + '.txt')
        quick_write_list_to_text(all_key_words, write_directory2 + '/' + str(i + 1) + '.txt')
        
        print "Segment %d Completed." % (i + 1)
Example #9
def select_top_N_words(read_directory1, read_directory2, write_directory):
    N = 1000
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory1)])
    
    score_dict = {"nr":1.0, "nr1":0.5, "nr2":0.75, "nrt":1.0, "nrf":1.0, "ns":1.0, "nsf":1.0, "nt":1.0, \
                   "nz":1.0, "nl":0.5, "ng":0.5, "n":0.9, "t":0.5, "tg":0.5, "s":0.3, "f":0.3, "j":0.5, \
                   "v":0.7, "vd":0.6, "vn":0.9, "vshi":0.0, "vyou":0.0, "vf":0.3, "vx":0.3, "vi":0.7, \
                   "vl":0.3, "vg":0.5, "a":0.6, "ad":0.3, "an":0.9, "ag":0.5, "al":0.3, "b":0.3, "bl":0.2, \
                    "z":0.9, "zg":0.3, "r":0.3, "rr":0.3, "rz":0.3, "rzt":0.3, "rzs":0.3, "rzv":0.3, "ry":0.2, \
                    "ryt":0.2, "rys":0.2, "ryv":0.2, "rg":0.2, "m":0.6, "mq":0.5, "q":0.6, "qv":0.7, "qt":0.7, \
                    "d":0.4, "p":0.0, "pba":0.0, "pbei":0.0, "c":0.0, "cc":0.0, "u":0.0, "ug":0.0, "e":0.0, \
                    "y":0.0, "o":0.0, "h":0.0, "k":0.0, "x":0.0, "xx":0.0, "xu":0.9, "w":0.0, "l":0.6, "i":0.6, \
                    "g":0.0, "vq":0.0, "nrfg":0.75, "dg":0.0, "mg":0.2, "yg":0.0}
    
    for i in range(file_number):
        each_word_tf = [] 
        key_words = []
        
        select_word = []
        word_score = []
        
        get_text_to_complex_list(each_word_tf, read_directory1 + '/' + str(i + 1) + '.txt', 0)
        each_word_tf = each_word_tf[1:]  # list; each inner list has 2 elements
        
        get_text_to_single_list(key_words, read_directory2 + '/' + str(i + 1) + '.txt')
        
        for j in range(len(each_word_tf)):
            word_entity = each_word_tf[j][0].split('/')[0]
            word_tag = each_word_tf[j][0].split('/')[1]
            if word_entity in key_words:
                select_word.append(word_entity)
                try:
                    word_score.append(float(each_word_tf[j][1]) * score_dict[word_tag] * 1.0)
                except KeyError:
                    word_score.append(float(0.0))  
            else:
                select_word.append(word_entity)
                try:
                    word_score.append(float(each_word_tf[j][1]) * score_dict[word_tag] * 0.80)
                except KeyError:
                    word_score.append(float(0.0))
        
        # Sort by weight in descending order
        sw = zip(select_word, word_score)
        sw = sorted(sw, key = itemgetter(1), reverse = True)    
        
        result_all = []
        count_number = 1
        for each in sw:
            result_all.append(each[0] + " " + str(each[1]))
            count_number += 1
            if count_number > N:
                break
        
        
        quick_write_list_to_text(result_all, write_directory + '/' + str(i + 1) + '.txt')
Example #10
def online_lda(read_directory1, read_directory2, write_directory1, write_directory2, write_directory3):
    
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory1)])
    latent_topic_number = 50
    
    for i in range(file_number):
        each_weibo_fenci = []
        all_weibo_word = []
        
        get_text_to_complex_list(each_weibo_fenci, read_directory1 + '/' + str(i + 1) + '.txt', 0)
        f = open(read_directory2 + '/' + str(i + 1) + '.txt')
        line = f.readline()
        while line:
            all_weibo_word.append([line.strip().split()[0]])
            line = f.readline()  
        f.close()
        
        dictionary = corpora.Dictionary(all_weibo_word)
        tf_corpus = [dictionary.doc2bow(text) for text in each_weibo_fenci]
        
        tf_corpus_to_string = []
        for each in tf_corpus:
            ss = [str(x) for x in each]
            tf_corpus_to_string.append("+".join(ss))
        
        lda = models.ldamodel.LdaModel(tf_corpus, num_topics=latent_topic_number)
        
        #Build the document-topic distribution matrix (THETA)
        THETA = []
        for j in range(len(tf_corpus)):
            this_line = np.zeros(latent_topic_number)
            for each1 in lda[tf_corpus[j]]:
                #each1 is a tuple (topic_id, probability)
                this_line[each1[0]] = each1[1]
            
            THETA.append(" ".join([str(x) for x in this_line]))
        
        #Build the topic-word distribution matrix (PHAI)
        PHAI = []
        raw_topics = lda.show_topics(topics=latent_topic_number, formatted=False)
        for j in range(latent_topic_number):
            this_line = np.zeros(len(all_weibo_word))
            for each2 in raw_topics[j]:
                #each2 is a tuple (probability, topic_id as a string)
                this_line[int(each2[1])] = each2[0]
            
            PHAI.append(" ".join([str(x) for x in this_line]))
            
        
        quick_write_list_to_text(tf_corpus_to_string, write_directory1 + '/' + str(i + 1) + '.txt')
        quick_write_list_to_text(THETA, write_directory2 + '/' + str(i + 1) + '.txt')
        quick_write_list_to_text(PHAI, write_directory3 + '/' + str(i + 1) + '.txt')
        
        print "Segment %d Completed." % (i + 1)
Example #11
def count_word_tf(read_directory1, read_directory2, write_directory):
    '''
    Compute the term frequency of every word in each data segment
    :param read_directory1: directory of text files
    :param read_directory2: directory of vocabulary files
    :param write_directory: output directory
    '''

    #Total number of files
    file_number = sum(
        [len(files) for root, dirs, files in os.walk(read_directory1)])

    for i in range(file_number):
        #Segmentation result of each text
        each_text_segment = []
        #All words in this data segment
        all_text_word = []

        get_text_to_complex_list(each_text_segment,
                                 read_directory1 + '/' + str(i + 1) + '.txt',
                                 0)
        get_text_to_single_list(all_text_word,
                                read_directory2 + '/' + str(i + 1) + '.txt')

        tf_dict = {}  #Term frequency (TF) dictionary
        for key in all_text_word:
            tf_dict[key] = 0

        for row in range(len(each_text_segment)):
            for j in range(len(each_text_segment[row])):
                try:
                    tf_dict[each_text_segment[row][j]] += 1
                except KeyError:
                    tf_dict[each_text_segment[row][j]] = 0

        #Term frequency list
        value_list = []
        for key in all_text_word:
            value_list.append(tf_dict[key])

        # Sort by term frequency in descending order
        va = zip(all_text_word, value_list)
        va = sorted(va, key=itemgetter(1), reverse=True)

        result_all = ['-Word- -TF-']
        for each in va:
            result_all.append(each[0] + " " + str(each[1]))

        #Write to file
        quick_write_list_to_text(result_all,
                                 write_directory + '/' + str(i + 1) + '.txt')
Example #12
def top_N_words_tfidf_vsm_process(read_directory1, read_directory2,
                                  write_directory):
    '''
    Construct the vector space model of the Weibo texts; the values are term frequencies
    :param read_directory1:
    :param read_directory2:
    :param write_directory:
    '''

    file_number = sum(
        [len(files) for root, dirs, files in os.walk(read_directory1)])

    for i in range(file_number):
        each_weibo_fenci = []
        all_weibo_fenci = []

        get_text_to_complex_list(each_weibo_fenci,
                                 read_directory1 + '/' + str(i + 1) + '.txt',
                                 2)
        f = open(read_directory2 + '/' + str(i + 1) + '.txt')
        line = f.readline()
        while line:
            all_weibo_fenci.append(line.strip().split()[0])
            line = f.readline()
        f.close()

        result = []

        for row in range(len(each_weibo_fenci)):

            tf_dict = {}  # Term frequency (TF) dictionary
            for key in all_weibo_fenci:
                tf_dict[key] = 0

            for j in range(len(each_weibo_fenci[row])):
                try:
                    tf_dict[each_weibo_fenci[row][j].split('/')[0]] += 1
                except KeyError:
                    tf_dict[each_weibo_fenci[row][j].split('/')[0]] = 0

            this_line = []
            for key in all_weibo_fenci:
                this_line.append(str(tf_dict[key]))

            #Join each row into one string for writing
            result.append(" ".join(this_line))

        quick_write_list_to_text(result,
                                 write_directory + '/' + str(i + 1) + '.txt')

    print "VSM Complete!!!"
Example #13
def top_N_words_tfidf_vsm_process(read_directory1, read_directory2, write_directory):
    '''
    Construct the vector space model of the Weibo texts; the values are term frequencies
    :param read_directory1:
    :param read_directory2:
    :param write_directory:
    '''
    
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory1)])
    
    for i in range(file_number):
        each_weibo_fenci = [] 
        all_weibo_fenci = []
        
        get_text_to_complex_list(each_weibo_fenci, read_directory1 + '/' + str(i + 1) + '.txt', 0)
        f = open(read_directory2 + '/' + str(i + 1) + '.txt')
        line = f.readline()
        while line:
            all_weibo_fenci.append(line.strip().split()[0])
            line = f.readline()  
        f.close()
        
        result = []
        
        for row in range(len(each_weibo_fenci)):
            
            tf_dict = {}  # Term frequency (TF) dictionary
            for key in all_weibo_fenci:
                tf_dict[key] = 0
            
            for j in range(len(each_weibo_fenci[row])):
                try:
                    tf_dict[each_weibo_fenci[row][j].split('/')[0]] += 1
                except KeyError:
                    tf_dict[each_weibo_fenci[row][j].split('/')[0]] = 0
            
            this_line = []
            for key in all_weibo_fenci:
                this_line.append(str(tf_dict[key]))
            
            #Join each row into one string for writing
            result.append(" ".join(this_line))
        
        quick_write_list_to_text(result, write_directory + '/' + str(i + 1) + '.txt')
    
    print "VSM Complete!!!"
Example #14
def count_word_tf(read_directory1, read_directory2, write_directory):
    '''
    Compute the term frequency of every word in each data segment
    :param read_directory1: directory of text files
    :param read_directory2: directory of vocabulary files
    :param write_directory: output directory
    '''
    
    #Total number of files
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory1)])
    
    for i in range(file_number):
        #Segmentation result of each text
        each_text_segment = []
        #All words in this data segment
        all_text_word = []
        
        get_text_to_complex_list(each_text_segment, read_directory1 + '/' + str(i + 1) + '.txt', 0)
        get_text_to_single_list(all_text_word, read_directory2 + '/'+ str(i + 1) + '.txt')
        
        tf_dict = {}  #Term frequency (TF) dictionary
        for key in all_text_word:
            tf_dict[key] = 0
            
        for row in range(len(each_text_segment)):
            for j in range(len(each_text_segment[row])):
                try:
                    tf_dict[each_text_segment[row][j]] += 1
                except KeyError:
                    tf_dict[each_text_segment[row][j]] = 0
        
        #Term frequency list
        value_list = []
        for key in all_text_word:
            value_list.append(tf_dict[key])
        
        # Sort by term frequency in descending order
        va = zip(all_text_word, value_list)
        va = sorted(va, key = itemgetter(1), reverse = True)    
        
        result_all = ['-Word- -TF-']
        for each in va:
            result_all.append(each[0] + " " + str(each[1]))
        
        #Write to file
        quick_write_list_to_text(result_all, write_directory + '/' + str(i + 1) + '.txt')
Example #15
def batch_count_tf(read_directory1, read_directory2, write_directory):
    '''
    
    :param read_directory1:
    :param read_directory2:
    :param write_directory:
    '''

    
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory1)])
    
    for i in range(file_number):
        each_weibo_fenci = [] 
        all_weibo_fenci = []
        
        get_text_to_complex_list(each_weibo_fenci, read_directory1 + '/' + str(i + 1) + '.txt', 0)
        get_text_to_single_list(all_weibo_fenci, read_directory2 + '/' + str(i + 1) + '.txt')
        
        tf_dict = {}  #Term frequency (TF) dictionary
        for key in all_weibo_fenci:
            tf_dict[key] = 0
            
        for row in range(len(each_weibo_fenci)):
            for j in range(len(each_weibo_fenci[row])):
                try:
                    tf_dict[each_weibo_fenci[row][j]] += 1
                except KeyError:
                    tf_dict[each_weibo_fenci[row][j]] = 0
        
        #Term frequency list
        value_list = []
        for key in all_weibo_fenci:
            value_list.append(tf_dict[key])
        
        # Sort by term frequency in descending order
        va = zip(all_weibo_fenci, value_list)
        va = sorted(va, key = itemgetter(1), reverse = True)    
        
        result_all = []
        for each in va:
            result_all.append(each[0] + " " + str(each[1]))
        
        quick_write_list_to_text(result_all, write_directory + '/' + str(i + 1) + '.txt')
        
        print "Segment %d Completed." % (i + 1)
Example #16
def em_evaluate(read_filename1, read_filename2, write_directory):

    # 2D list of strings
    classification_result = []
    get_text_to_complex_list(classification_result, read_filename1, 0)

    # list of strings
    real_tag = []
    get_text_to_single_list(real_tag, read_filename2)

    #List index + 1 is the cluster number; the values are the corresponding ground-truth label numbers (e.g. cluster 1 maps to label 5, ...)
    reflect_tag = [['7'], ['1'], ['5'], ['4'], ['3', '2'], ['1'], ['1'],
                   ['1', '2']]

    precision_list = []
    recall_list = []
    fmeasure_list = []
    for i in range(len(reflect_tag)):
        real_cluster_partion = []
        for j in range(len(real_tag)):
            if real_tag[j] in reflect_tag[i]:
                real_cluster_partion.append(str(j))

        correct = len(
            set(classification_result[i]) & set(real_cluster_partion))
        this_precision = np.true_divide(correct,
                                        len(set(classification_result[i])))
        this_recall = np.true_divide(correct, len(set(real_cluster_partion)))
        this_fmeasure = np.true_divide(2.0 * this_precision * this_recall,
                                       (this_precision + this_recall))

        print this_precision, this_recall, this_fmeasure

        precision_list.append(str(this_precision))
        recall_list.append(str(this_recall))
        fmeasure_list.append(str(this_fmeasure))

    average_precision = np.average([float(x) for x in precision_list])
    average_recall = np.average([float(x) for x in recall_list])
    average_fmeasure = np.average([float(x) for x in fmeasure_list])
    print 'Average:', average_precision, average_recall, average_fmeasure
    quick_write_list_to_text(precision_list,
                             write_directory + u'/precision.txt')
    quick_write_list_to_text(recall_list, write_directory + u'/recall.txt')
    quick_write_list_to_text(fmeasure_list, write_directory + u'/fmeasure.txt')
Example #17
def classification_evaluate(read_filename1, read_filename2, write_directory):

    # 2D list of strings
    classification_result = []
    get_text_to_complex_list(classification_result, read_filename1, 0)

    # list of strings
    real_tag = []
    get_text_to_single_list(real_tag, read_filename2)

    # Must be entered manually
    class_tag = ['5', '6', '3', '8', '2', '4', '1', '7']

    precision_list = []
    recall_list = []
    fmeasure_list = []
    for i in range(len(class_tag)):
        real_classification = []
        for j in range(len(real_tag)):
            if real_tag[j] == class_tag[i]:
                real_classification.append(str(j))

        correct = len(set(classification_result[i]) & set(real_classification))
        this_precision = np.true_divide(correct,
                                        len(set(classification_result[i])))
        this_recall = np.true_divide(correct, len(set(real_classification)))
        this_fmeasure = np.true_divide(2.0 * this_precision * this_recall,
                                       (this_precision + this_recall))

        print this_precision, this_recall, this_fmeasure

        precision_list.append(str(this_precision))
        recall_list.append(str(this_recall))
        fmeasure_list.append(str(this_fmeasure))

    average_precision = np.average([float(x) for x in precision_list])
    average_recall = np.average([float(x) for x in recall_list])
    average_fmeasure = np.average([float(x) for x in fmeasure_list])
    print 'Average:', average_precision, average_recall, average_fmeasure
    quick_write_list_to_text(precision_list,
                             write_directory + u'/precision.txt')
    quick_write_list_to_text(recall_list, write_directory + u'/recall.txt')
    quick_write_list_to_text(fmeasure_list, write_directory + u'/fmeasure.txt')
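The evaluation functions above compute precision, recall and F-measure per class from set intersections. As a quick worked example with hypothetical id sets:

import numpy as np

predicted = set(['0', '1', '2', '5'])  # ids assigned to one class
actual = set(['1', '2', '3'])          # ids truly in that class

correct = len(predicted & actual)                    # 2
precision = np.true_divide(correct, len(predicted))  # 0.5
recall = np.true_divide(correct, len(actual))        # ~0.667
fmeasure = np.true_divide(2.0 * precision * recall, precision + recall)  # ~0.571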
Example #18
def classification_evaluate(read_filename1, read_filename2, write_directory):
    
    # 2D list of strings
    classification_result = []
    get_text_to_complex_list(classification_result, read_filename1, 0)
    
    # list of strings
    real_tag = []
    get_text_to_single_list(real_tag, read_filename2)
    
    # Must be entered manually
    class_tag = ['2', '3', '6', '1', '5', '7', '4']
    class_tag2 = ['2', '3', '8', '1', '5', '7', '4']
    
    precision_list = []
    recall_list = []
    fmeasure_list = []
    for i in range(len(class_tag)):
        real_classification = []
        for j in range(len(real_tag)):
            # Treat labels 6 and 8 as one class
            if real_tag[j] == class_tag[i] or real_tag[j] == class_tag2[i]:
                real_classification.append(str(j))
        
        correct = len(set(classification_result[i]) & set(real_classification))
        this_precision = np.true_divide(correct, len(set(classification_result[i])))
        this_recall = np.true_divide(correct, len(set(real_classification)))
        this_fmeasure = np.true_divide(2.0 * this_precision * this_recall, (this_precision + this_recall))
        
        print this_precision, this_recall, this_fmeasure

        precision_list.append(str(this_precision))
        recall_list.append(str(this_recall))
        fmeasure_list.append(str(this_fmeasure))
    
    average_precision = np.average([float(x) for x in precision_list])
    average_recall = np.average([float(x) for x in recall_list])
    average_fmeasure = np.average([float(x) for x in fmeasure_list])
    print 'Average:', average_precision, average_recall, average_fmeasure
    quick_write_list_to_text(precision_list, write_directory + u'/precision.txt')
    quick_write_list_to_text(recall_list, write_directory + u'/recall.txt')
    quick_write_list_to_text(fmeasure_list, write_directory + u'/fmeasure.txt')
Example #19
def kmeans_evaluate(read_filename1, read_filename2, write_directory):
    
    # 2D list of strings
    classification_result = []
    get_text_to_complex_list(classification_result, read_filename1, 0)
    
    # list of strings
    real_tag = []
    get_text_to_single_list(real_tag, read_filename2)
    
    #List index + 1 is the cluster number; the values are the corresponding ground-truth label numbers (e.g. cluster 1 maps to label 5, ...)
    reflect_tag = [['1'], ['4'], ['5'], ['7'], ['6', '8'], ['2'], ['3'], ['x']]
    
    precision_list = []
    recall_list = []
    fmeasure_list = []
    for i in range(len(reflect_tag)):
        real_cluster_partion = []
        for j in range(len(real_tag)):
            if real_tag[j] in reflect_tag[i]:
                real_cluster_partion.append(str(j))
        
        correct = len(set(classification_result[i]) & set(real_cluster_partion))
        this_precision = np.true_divide(correct, len(set(classification_result[i])))
        this_recall = np.true_divide(correct, len(set(real_cluster_partion)))
        this_fmeasure = np.true_divide(2.0 * this_precision * this_recall, (this_precision + this_recall))
        
        print this_precision, this_recall, this_fmeasure
        
        precision_list.append(str(this_precision))
        recall_list.append(str(this_recall))
        fmeasure_list.append(str(this_fmeasure))
    
    average_precision = np.average([float(x) for x in precision_list])
    average_recall = np.average([float(x) for x in recall_list])
    average_fmeasure = np.average([float(x) for x in fmeasure_list])
    print 'Average:', average_precision, average_recall, average_fmeasure
    quick_write_list_to_text(precision_list, write_directory + u'/precision.txt')
    quick_write_list_to_text(recall_list, write_directory + u'/recall.txt')
    quick_write_list_to_text(fmeasure_list, write_directory + u'/fmeasure.txt')
Example #20
def sample_vsm(read_filename1, read_filename2, write_filename):
    
    weibo_content = []
    all_word_list = []
    
    select_number = 1000
    
    get_text_to_complex_list(weibo_content, read_filename1, 0)
    
    f = open(read_filename2)
    line = f.readline()
    while line:
        all_word_list.append(line.strip().split()[0])
        line = f.readline()  
    f.close()
    
    all_word_list = all_word_list[0 : select_number]
    
    vsm = []
        
    for row in range(len(weibo_content)):
            
        tf_dict = {}  # Term frequency (TF) dictionary
        for key in all_word_list:
            tf_dict[key] = 0
            
        for j in range(len(weibo_content[row])):
            try:
                tf_dict[weibo_content[row][j].split('/')[0]] += 1
            except KeyError:
                tf_dict[weibo_content[row][j].split('/')[0]] = 0
            
        this_line = []
        for key in all_word_list:
            this_line.append(str(tf_dict[key]))
            
        #Join each row into one string for writing
        vsm.append(" ".join(this_line))
        
    quick_write_list_to_text(vsm, write_filename)
Example #21
def sample_vsm(read_filename1, read_filename2, write_filename):

    weibo_content = []
    all_word_list = []

    select_number = 1000

    get_text_to_complex_list(weibo_content, read_filename1, 0)

    f = open(read_filename2)
    line = f.readline()
    while line:
        all_word_list.append(line.strip().split()[0])
        line = f.readline()
    f.close()

    all_word_list = all_word_list[0:select_number]

    vsm = []

    for row in range(len(weibo_content)):

        tf_dict = {}  # Term frequency (TF) dictionary
        for key in all_word_list:
            tf_dict[key] = 0

        for j in range(len(weibo_content[row])):
            try:
                tf_dict[weibo_content[row][j].split('/')[0]] += 1
            except KeyError:
                tf_dict[weibo_content[row][j].split('/')[0]] = 0

        this_line = []
        for key in all_word_list:
            this_line.append(str(tf_dict[key]))

        #Join each row into one string for writing
        vsm.append(" ".join(this_line))

    quick_write_list_to_text(vsm, write_filename)
Example #22
def count_word_tf(read_filename1, read_filename2, write_filename):
    '''
    Compute the term frequency of all words in the data
    :param read_filename1:
    :param read_filename2:
    :param write_filename:
    '''
    
    each_weibo_fenci = [] 
    all_weibo_fenci = []
        
    get_text_to_complex_list(each_weibo_fenci, read_filename1, 0)
    get_text_to_single_list(all_weibo_fenci, read_filename2)
        
    tf_dict = {}  #Term frequency (TF) dictionary
    for key in all_weibo_fenci:
        tf_dict[key] = 0
            
    for row in range(len(each_weibo_fenci)):
        for j in range(len(each_weibo_fenci[row])):
            try:
                tf_dict[each_weibo_fenci[row][j]] += 1
            except KeyError:
                tf_dict[each_weibo_fenci[row][j]] = 0
        
    #Term frequency list
    value_list = []
    for key in all_weibo_fenci:
        value_list.append(tf_dict[key])
        
    # Sort by term frequency in descending order
    va = zip(all_weibo_fenci, value_list)
    va = sorted(va, key = itemgetter(1), reverse = True)    
        
    result_all = []
    for each in va:
        result_all.append(each[0] + " " + str(each[1]))
       
    quick_write_list_to_text(result_all, write_filename)
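The same term-frequency counting can be written more compactly with collections.Counter; a sketch that produces the same "word count" lines, assuming the 2D token list and vocabulary list used above:

from collections import Counter

def count_word_tf_compact(each_weibo_fenci, all_weibo_fenci):
    # Count every token, keep only words from the vocabulary list,
    # and sort by frequency in descending order.
    counts = Counter(word for row in each_weibo_fenci for word in row)
    va = sorted(((w, counts[w]) for w in all_weibo_fenci),
                key=lambda x: x[1], reverse=True)
    return [w + " " + str(c) for w, c in va]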
Example #23
def select_top_N_words(read_filename1, read_filename2, read_filename3,
                       write_filename):
    '''
    Select the top N words as high-quality feature words
    :param read_filename1:
    :param read_filename2:
    :param read_filename3:
    :param write_filename:
    '''
    N = 3000

    #Assign weights by part-of-speech (POS) tag
    score_dict = {"nr":1.0, "nr1":0.5, "nr2":0.75, "nrt":1.0, "nrf":1.0, "ns":1.0, "nsf":1.0, "nt":1.0, \
                   "nz":1.0, "nl":0.5, "ng":0.5, "n":0.9, "t":0.5, "tg":0.5, "s":0.3, "f":0.3, "j":0.5, \
                   "v":0.7, "vd":0.6, "vn":0.9, "vshi":0.0, "vyou":0.0, "vf":0.3, "vx":0.3, "vi":0.7, \
                   "vl":0.3, "vg":0.5, "a":0.6, "ad":0.3, "an":0.9, "ag":0.5, "al":0.3, "b":0.3, "bl":0.2, \
                    "z":0.9, "zg":0.3, "r":0.3, "rr":0.3, "rz":0.3, "rzt":0.3, "rzs":0.3, "rzv":0.3, "ry":0.2, \
                    "ryt":0.2, "rys":0.2, "ryv":0.2, "rg":0.2, "m":0.6, "mq":0.5, "q":0.6, "qv":0.7, "qt":0.7, \
                    "d":0.4, "p":0.0, "pba":0.0, "pbei":0.0, "c":0.0, "cc":0.0, "u":0.0, "ug":0.0, "e":0.0, \
                    "y":0.0, "o":0.0, "h":0.0, "k":0.0, "x":0.1, "xx":0.0, "xu":0.9, "w":0.0, "l":0.6, "i":0.6, \
                    "g":0.0, "vq":0.0, "nrfg":0.75, "dg":0.0, "mg":0.2, "yg":0.0}

    each_word_tf = []
    key_words = []

    select_word = []
    word_score = []

    user_dict = []

    get_text_to_complex_list(each_word_tf, read_filename1, 0)

    get_text_to_single_list(key_words, read_filename2)

    f = open(read_filename3, 'r')
    line = f.readline()
    while line:
        user_dict.append(line.split()[0])
        line = f.readline()
    f.close()

    for j in range(len(each_word_tf)):
        word_entity = each_word_tf[j][0].split('/')[0]
        word_tag = each_word_tf[j][0].split('/')[1]
        if word_entity in user_dict:
            #Words in the user dictionary get a high weight
            select_word.append(word_entity)
            word_score.append(np.log(float(each_word_tf[j][1])) * 1.0 * 1.0)
        elif word_entity in key_words:
            #Keywords also get a high weight
            select_word.append(word_entity)
            try:
                word_score.append(
                    np.log(float(each_word_tf[j][1])) * score_dict[word_tag] *
                    1.0)
            except KeyError:
                word_score.append(float(0.0))
        else:
            #All other words are multiplied by 0.6
            select_word.append(word_entity)
            try:
                word_score.append(
                    np.log(float(each_word_tf[j][1])) * score_dict[word_tag] *
                    0.60)
            except KeyError:
                word_score.append(float(0.0))

    # Sort by weight in descending order
    sw = zip(select_word, word_score)
    sw = sorted(sw, key=itemgetter(1), reverse=True)

    result_all = []
    count_number = 1
    for each in sw:
        result_all.append(each[0] + " " + str(each[1]))
        count_number += 1
        if count_number > N:
            break

    quick_write_list_to_text(result_all, write_filename)
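The "sort by weight, then take the first N" loop used here (and in the other select_top_N_words variants) is equivalent to sorting and slicing; a small sketch of that shorter form:

from operator import itemgetter

def top_n_lines(select_word, word_score, n):
    # Pair words with scores, sort by score descending, keep the first n.
    sw = sorted(zip(select_word, word_score), key=itemgetter(1), reverse=True)
    return [word + " " + str(score) for word, score in sw[:n]]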
Example #24
def cquery(keyword_list, mode, time_interval, select, read_directory1, read_directory2, write_filename):
    '''
    
    :param keyword_list:
    :param mode:
    :param time_interval:
    :param select:
    :param read_directory1: root data directory
    :param read_directory2: index directory
    :param write_filename:
    '''

    if len(time_interval) != 2:
        print "Set Time Error!"
        return
    
    if (mode != "AND") and (mode != "OR"):
        print "Mode Error!"
        return
    
    start = time_interval[0]
    end = time_interval[1]
    
    #Number of files
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory2)])
    
    query_result = []
    entropy_result = []
    
    #Search within the current directory
    for i in range(file_number):
        #Read time information
        f = open(read_directory1 + '/update_id_time/' + str(i + 1) + '.txt')
        time_lines = f.readlines()
        f.close()
        
        #If the latest time in this segment is earlier than the query start time, skip the segment
        if float(time_lines[-1].strip().split()[-1]) < start:
            #print float(time_lines[-1].strip().split()[-1])
            pass
        #If the earliest time in this segment is later than the query end time, stop
        elif float(time_lines[0].strip().split()[-1]) > end:
            
            break
        else:
            #Indices of the compressed data items in the original data
            f1 = open(read_directory2 + '/' + str(i + 1) + '.txt')
            data_index = f1.readlines()
            data_index = [int(x) for x in data_index]
            #print data_index
            f1.close()
            
            #VSM vectors of the data
            each_weibo_vsm = []
            get_text_to_complex_list(each_weibo_vsm, u'D:/Local/DataStreamMining/dataset/non_orthogonal/topics_data1/重构数据/' + str(i + 1) + '.txt', 0)
            
            #Word list corresponding to the VSM columns
            word_list = []
            f4 = open(read_directory1 + '/top_n_word/' + str(i + 1) + '.txt')
            word_lines = f4.readlines()
            f4.close()
            for each in word_lines:
                word_list.append(each.strip().split()[0])
            
            #Information entropy list
            f5 = open(read_directory1 + '/entropy/' + str(i + 1) + '.txt')
            entropy_list = f5.readlines()
            f5.close()
            entropy_list = [float(x.strip()) for x in entropy_list]

            #Traverse each data segment line by line
            for j in range(len(time_lines)):
                #Timestamp of the current line
                now_t = float(time_lines[j].strip().split()[-1])
                
                if (now_t >= start) and (now_t <= end) and (j in data_index):
                    if mode == "OR":
                        flag = 0
                        for each1 in keyword_list:
                            for k in range(len(word_list)):
                                if (each1 in word_list[k]) and (float(each_weibo_vsm[j][k]) > 0.000001):
                                    this_message = " ".join(vsm_map_word(each_weibo_vsm[j], word_list))
                                    if this_message not in query_result:
                                        query_result.append(this_message)
                                        entropy_result.append(entropy_list[j])
                                    flag = 1
                                    break
                            if flag == 1:
                                break
                    else:
                        flag = 0
                        for each1 in keyword_list:
                            for k in range(len(word_list)):
                                if (each1 in word_list[k]) and (float(each_weibo_vsm[j][k]) > 0.000001):
                                    flag += 1
                                    break
                                
                        if flag == len(keyword_list):
                            this_message = " ".join(vsm_map_word(each_weibo_vsm[j], word_list))
                            if this_message not in query_result:
                                query_result.append(this_message)
                                entropy_result.append(entropy_list[j])
     
    #Sort by entropy in descending order
    el = zip(entropy_result, query_result)
    el1 = sorted(el, key = itemgetter(0), reverse = True)
    #Select the corresponding line indices
    query_result2 = []
    count_number = 1
    for each in el1:
        query_result2.append(each[1])
        count_number += 1
        if count_number > select:
            break    
    
    quick_write_list_to_text(query_result2, write_filename)
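vsm_map_word is not defined in these examples; from the way it is called it appears to map a VSM row back to the words with non-zero weight. A hypothetical reconstruction under that assumption:

def vsm_map_word(vsm_row, word_list):
    # Return the words whose value in the VSM row is effectively non-zero.
    words = []
    for k in range(len(word_list)):
        if float(vsm_row[k]) > 0.000001:
            words.append(word_list[k])
    return words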
Example #25
def select_top_N_words(read_directory1, read_directory2, read_filename, write_directory):
    '''
    Select the top N words as high-quality feature words
    :param read_directory1:
    :param read_directory2:
    :param read_filename:
    :param write_directory:
    '''
    N = 500
    
    #Assign weights by part-of-speech (POS) tag
    score_dict = {"nr":1.0, "nr1":0.5, "nr2":0.75, "nrt":1.0, "nrf":1.0, "ns":1.0, "nsf":1.0, "nt":1.0, \
                   "nz":1.0, "nl":0.5, "ng":0.5, "n":0.9, "t":0.5, "tg":0.5, "s":0.3, "f":0.3, "j":0.5, \
                   "v":0.7, "vd":0.6, "vn":0.9, "vshi":0.0, "vyou":0.0, "vf":0.3, "vx":0.3, "vi":0.7, \
                   "vl":0.3, "vg":0.5, "a":0.6, "ad":0.3, "an":0.9, "ag":0.5, "al":0.3, "b":0.3, "bl":0.2, \
                    "z":0.9, "zg":0.3, "r":0.3, "rr":0.3, "rz":0.3, "rzt":0.3, "rzs":0.3, "rzv":0.3, "ry":0.2, \
                    "ryt":0.2, "rys":0.2, "ryv":0.2, "rg":0.2, "m":0.2, "mq":0.5, "q":0.6, "qv":0.7, "qt":0.7, \
                    "d":0.4, "p":0.0, "pba":0.0, "pbei":0.0, "c":0.0, "cc":0.0, "u":0.0, "ug":0.0, "e":0.0, \
                    "y":0.0, "o":0.0, "h":0.0, "k":0.0, "x":0.1, "xx":0.0, "xu":0.9, "w":0.0, "l":0.6, "i":0.6, \
                    "g":0.0, "vq":0.0, "nrfg":0.75, "dg":0.0, "mg":0.2, "yg":0.0, "eng":0.1}
    
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory1)])
    
    for i in range(file_number):
        each_word_tf = [] 
        key_words = []
        
        select_word = []
        word_score = []
    
        user_dict = []
        
        get_text_to_complex_list(each_word_tf, read_directory1 + '/' + str(i + 1) + '.txt', 0)
        
        get_text_to_single_list(key_words, read_directory2 + '/' + str(i + 1) + '.txt')
    
        f = open(read_filename, 'r')
        line = f.readline()
        while line:
            user_dict.append(line.split()[0])
            line = f.readline()
        f.close()
        
        for j in range(len(each_word_tf)):
            word_entity = each_word_tf[j][0].split('/')[0]
            word_tag = each_word_tf[j][0].split('/')[1]
            if word_entity in user_dict:
                #Words in the user dictionary get a high weight
                select_word.append(word_entity)
                word_score.append(np.log(float(each_word_tf[j][1])) * 1.0 * 1.0)
            elif word_entity in key_words and word_tag != 'eng':
                #Keywords also get a high weight
                select_word.append(word_entity)
                try:
                    word_score.append(np.log(float(each_word_tf[j][1])) * score_dict[word_tag] * 1.0)
                except KeyError:
                    word_score.append(float(0.0))
                    
            else:
                #All other words are multiplied by 0.5
                select_word.append(word_entity)
                try:
                    word_score.append(np.log(float(each_word_tf[j][1])) * score_dict[word_tag] * 0.50)
                except KeyError:
                    word_score.append(float(0.0))
        
        # Sort by weight in descending order
        sw = zip(select_word, word_score)
        sw = sorted(sw, key = itemgetter(1), reverse = True)    
        
        result_all = []
        count_number = 1
        for each in sw:
            result_all.append(each[0] + " " + str(each[1]))
            count_number += 1
            if count_number > N:
                break
        
        quick_write_list_to_text(result_all, write_directory + '/' + str(i + 1) + '.txt')
        
        print "Segment %d Completed." % (i + 1)
Example #26
def select_top_N_words(read_directory1, read_directory2, write_directory):
    '''
    Select the top N words
    :param read_directory1: directory of word TF files
    :param read_directory2: directory of keyword files
    :param write_directory: output directory
    '''
    #Number of words to select
    N = 2000
    
    #Number of files in the directory
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory1)])
    
    #Weight dictionary, assigned by POS tag
    score_dict = {"CC":0.0, "CD":0.0, "DT":0.2, "EX":0.0, "FW":0.3, "IN":0.0, "JJ":0.7, \
                  "JJR":0.75, "JJS":0.75, "LS":0.0, "MD":0.5, "NN":0.9, "NNS":0.9, "NNP":1.0, \
                  "NNPS":1.0, "PDT":0.0, "POS":0.0, "PRP":0.1, "PRP$":0.1, \
                  "RB":0.3, "RBR":0.35, "RBS":0.4, "RP":0.5, "SYM":0.0, "TO":0.0, "UH":0.0, \
                  "VB":0.7, "VBD":0.7, "VBG":0.7, "VBN":0.75, "VBP":0.7, "VBZ":0.7, \
                  "WDT":0.0, "WP":0.3, "WP$":0.3, "WRB":0.0, ":":0.0}
    
    for i in range(file_number):
        each_word_tf = [] 
        key_words = []
        
        select_word = []
        word_score = []
        
        get_text_to_complex_list(each_word_tf, read_directory1 + '/' + str(i + 1) + '.txt', 0)
        each_word_tf = each_word_tf[1:]  # list; each inner list has 2 elements
        
        get_text_to_single_list(key_words, read_directory2 + '/' + str(i + 1) + '.txt')
        
        for j in range(len(each_word_tf)):
            #word_entity = each_word_tf[j][0].split('/')[0]
            word_tag = each_word_tf[j][0].split(',')[1]
            if each_word_tf[j][0] in key_words:
                select_word.append(each_word_tf[j][0])
                try:
                    word_score.append(float(each_word_tf[j][1]) * score_dict[word_tag] * 1.0)
                except KeyError:
                    word_score.append(float(0.0))  
            else:
                select_word.append(each_word_tf[j][0])
                try:
                    word_score.append(float(each_word_tf[j][1]) * score_dict[word_tag] * 0.80)
                except KeyError:
                    word_score.append(float(0.0))
        
        # Sort by weight in descending order
        sw = zip(select_word, word_score)
        sw = sorted(sw, key = itemgetter(1), reverse = True)    
        
        result_all = []
        count_number = 1
        for each in sw:
            result_all.append(each[0] + " " + str(each[1]))
            count_number += 1
            if count_number > N:
                break

        quick_write_list_to_text(result_all, write_directory + '/' + str(i + 1) + '.txt')
Example #27
def select_top_N_words(read_directory1, read_directory2, read_filename3,
                       write_directory):
    N = 500
    file_number = np.sum(
        [len(files) for root, dirs, files in os.walk(read_directory1)])

    score_dict = {"nr":1.0, "nr1":0.5, "nr2":0.75, "nrt":1.0, "nrf":1.0, "ns":1.0, "nsf":1.0, "nt":1.0, \
                   "nz":1.0, "nl":0.5, "ng":0.5, "n":0.9, "t":0.5, "tg":0.5, "s":0.3, "f":0.3, "j":0.5, \
                   "v":0.7, "vd":0.6, "vn":0.9, "vshi":0.0, "vyou":0.0, "vf":0.3, "vx":0.3, "vi":0.7, \
                   "vl":0.3, "vg":0.5, "a":0.6, "ad":0.3, "an":0.9, "ag":0.5, "al":0.3, "b":0.3, "bl":0.2, \
                    "z":0.9, "zg":0.3, "r":0.3, "rr":0.3, "rz":0.3, "rzt":0.3, "rzs":0.3, "rzv":0.3, "ry":0.2, \
                    "ryt":0.2, "rys":0.2, "ryv":0.2, "rg":0.2, "m":0.6, "mq":0.5, "q":0.6, "qv":0.7, "qt":0.7, \
                    "d":0.4, "p":0.0, "pba":0.0, "pbei":0.0, "c":0.0, "cc":0.0, "u":0.0, "ug":0.0, "e":0.0, \
                    "y":0.0, "o":0.0, "h":0.0, "k":0.0, "x":0.0, "xx":0.0, "xu":0.9, "w":0.0, "l":0.6, "i":0.6, \
                    "g":0.0, "vq":0.0, "nrfg":0.75, "dg":0.0, "mg":0.2, "yg":0.0}
    user_dict = []

    f = open(read_filename3, 'r')
    line = f.readline()
    while line:
        user_dict.append(line.split()[0])
        line = f.readline()
    f.close()

    for i in range(file_number):
        each_word_tf = []
        key_words = []

        select_word = []
        word_score = []

        get_text_to_complex_list(each_word_tf,
                                 read_directory1 + '/' + str(i + 1) + '.txt',
                                 0)

        get_text_to_single_list(key_words,
                                read_directory2 + '/' + str(i + 1) + '.txt')

        for j in range(len(each_word_tf)):
            word_entity = each_word_tf[j][0].split('/')[0]
            word_tag = each_word_tf[j][0].split('/')[1]
            if word_entity in user_dict:
                select_word.append(word_entity)
                word_score.append(
                    np.log(float(each_word_tf[j][1])) * 1.0 * 1.0)
            elif word_entity in key_words:
                select_word.append(word_entity)
                try:
                    word_score.append(
                        np.log(float(each_word_tf[j][1])) *
                        score_dict[word_tag] * 1.0)
                except KeyError:
                    word_score.append(float(0.0))
            else:
                select_word.append(word_entity)
                try:
                    word_score.append(
                        np.log(float(each_word_tf[j][1])) *
                        score_dict[word_tag] * 0.60)
                except KeyError:
                    word_score.append(float(0.0))

        # Sort by weight in descending order
        sw = zip(select_word, word_score)
        sw = sorted(sw, key=itemgetter(1), reverse=True)

        result_all = []
        count_number = 1
        for each in sw:
            result_all.append(each[0] + " " + str(each[1]))
            count_number += 1
            if count_number > N:
                break

        quick_write_list_to_text(result_all,
                                 write_directory + '/' + str(i + 1) + '.txt')
Example #28
def pre_text_classify(read_filename1, read_filename2, read_filename3, write_filename):

    # Display 5 words
    # Select 3 words when querying
    select_number = 5

    # Cluster labels from the frequent-itemset clustering, strings starting from 1
    class_tag = []
    get_text_to_single_list(class_tag, read_filename1)

    # Number of clusters
    cluster_number = len(set(class_tag))

    # Frequent itemsets, 2D list of strings
    pattern_all = []
    get_text_to_complex_list(pattern_all, read_filename2, 0)
    pattern_all = pattern_all[0 : len(class_tag)]

    # Partition of the frequent itemsets by cluster, 2D list of ints
    class_partion = []
    for i in range(cluster_number):
        class_partion.append([])

    for i in range(len(class_tag)):
        for j in range(cluster_number):
            if class_tag[i] == str(j + 1):
                class_partion[j].append(i)

    # Get the global word weights
    word_weight_dict = {}
    f = open(read_filename3, "r")
    line = f.readline()
    while line:
        word_weight_dict[line.split()[0]] = float(line.split()[1])
        line = f.readline()
    f.close()

    # Get all distinct words in the frequent itemsets
    all_word_list = []
    for each in pattern_all:
        for word in set(each).difference(all_word_list):
            all_word_list.append(word)

    # Number of frequent itemsets containing each word
    I_dict = {}
    for each in all_word_list:
        I_dict[each] = 0
        for each1 in pattern_all:
            if each in each1:
                I_dict[each] += 1

    # Number of clusters containing each word
    C_dict = {}
    for each in all_word_list:
        C_dict[each] = 0
        for i in range(len(class_partion)):
            for j in range(len(class_partion[i])):
                if each in pattern_all[class_partion[i][j]]:
                    C_dict[each] += 1
                    break

    cluster_word_list = []
    for i in range(len(class_partion)):
        # Get all distinct words in this cluster
        this_word_list = []
        for j in range(len(class_partion[i])):
            for each in pattern_all[class_partion[i][j]]:
                if each not in this_word_list:
                    this_word_list.append(each)

        # Compute the support of each word within the cluster
        sup_dict = {}

        for each in this_word_list:
            sup_dict[each] = 0
            for j in range(len(class_partion[i])):
                if each in pattern_all[class_partion[i][j]]:
                    sup_dict[each] += 1

        word_score_list = []
        # Compute the weight of each word in the cluster, used as the basis for query classification
        for each in this_word_list:
            global_weight = np.true_divide(len(pattern_all) * cluster_number, (I_dict[each] * C_dict[each]))
            word_score = word_weight_dict[each] * sup_dict[each] * np.log(global_weight + 1.0)
            word_score_list.append(word_score)

        # Sort by weight in descending order
        tw = zip(this_word_list, word_score_list)
        tw = sorted(tw, key=itemgetter(1), reverse=True)

        this_word_list = []
        word_score_list = []

        count = 0
        for each in tw:
            this_word_list.append(each[0])
            count += 1
            if count >= select_number:
                break

        cluster_word_list.append(" ".join(this_word_list))

    quick_write_list_to_text(cluster_word_list, write_filename)
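To illustrate the scoring formula above with hypothetical numbers (100 frequent itemsets, 8 clusters, a word contained in 4 itemsets and 2 clusters, support 3 inside the cluster, global word weight 0.5):

import numpy as np

global_weight = np.true_divide(100 * 8, (4 * 2))    # 100.0
word_score = 0.5 * 3 * np.log(global_weight + 1.0)  # 1.5 * log(101), about 6.92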
Example #29
def pre_text_classify(read_filename1, read_filename2, read_filename3, write_filename):
    
    #Display 5 words
    #Select 3 words when querying
    select_number = 5
    
    # Cluster labels from the frequent-itemset clustering, strings starting from 1
    class_tag = []
    get_text_to_single_list(class_tag, read_filename1)

    # Number of clusters
    cluster_number = len(set(class_tag))
    
    # Frequent itemsets, 2D list of strings
    pattern_all = []
    get_text_to_complex_list(pattern_all, read_filename2, 0)
    pattern_all = pattern_all[0: len(class_tag)]
    
    # Partition of the frequent itemsets by cluster, 2D list of ints
    class_partion = []
    for i in range(cluster_number):
        class_partion.append([])
        
    for i in range(len(class_tag)):
        for j in range(cluster_number):
            if class_tag[i] == str(j + 1):
                class_partion[j].append(i)
    
    # Get the global word weights
    word_weight_dict = {}
    f = open(read_filename3, 'r')
    line = f.readline()
    while line:
        word_weight_dict[line.split()[0]] = float(line.split()[1])
        line = f.readline()
    f.close()
    
    # Get all distinct words in the frequent itemsets
    all_word_list = []
    for each in pattern_all:
        for word in set(each).difference(all_word_list):
            all_word_list.append(word)
    
    # Number of frequent itemsets containing each word
    I_dict = {}
    for each in all_word_list:
        I_dict[each] = 0
        for each1 in pattern_all:
            if each in each1:
                I_dict[each] += 1
    
    # Number of clusters containing each word
    C_dict = {}
    for each in all_word_list:
        C_dict[each] = 0
        for i in range(len(class_partion)):
            for j in range(len(class_partion[i])):
                if each in pattern_all[class_partion[i][j]]:
                    C_dict[each] += 1
                    break
    
    cluster_word_list = []   
    for i in range(len(class_partion)):
        # Get all distinct words in this cluster
        this_word_list = []
        for j in range(len(class_partion[i])):
            for each in pattern_all[class_partion[i][j]]:
                if each not in this_word_list:
                    this_word_list.append(each)
        
        # Compute the support of each word within the cluster
        sup_dict = {}
        
        for each in this_word_list:
            sup_dict[each] = 0
            for j in range(len(class_partion[i])):
                if each in pattern_all[class_partion[i][j]]:
                    sup_dict[each] += 1
        
        word_score_list = []
        # Compute the weight of each word in the cluster, used as the basis for query classification
        for each in this_word_list:
            global_weight = np.true_divide(len(pattern_all) * cluster_number, (I_dict[each] * C_dict[each])) 
            word_score = word_weight_dict[each] * sup_dict[each] * np.log(global_weight + 1.0)
            word_score_list.append(word_score)
        
        # Sort by weight in descending order
        tw = zip(this_word_list, word_score_list)
        tw = sorted(tw, key = itemgetter(1), reverse = True)
        
        this_word_list = []
        word_score_list = []
        
        count = 0
        for each in tw:
            this_word_list.append(each[0])
            count += 1
            if count >= select_number:
                break
        
        cluster_word_list.append(" ".join(this_word_list))
    
    quick_write_list_to_text(cluster_word_list, write_filename)
Example #30
def oquery(keyword_list, mode, time_interval, select, read_directory, write_filename):
    '''
    
    :param keyword_list:
    :param mode:
    :param time_interval:
    :param select:
    :param read_directory: root data directory
    :param write_filename:
    '''

    if len(time_interval) != 2:
        print "Set Time Error!"
        return
    
    if (mode != "AND") and (mode != "OR"):
        print "Mode Error!"
        return
    
    start = time_interval[0]
    end = time_interval[1]
    
    # Number of files
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory + '/update_vsm')])
    
    query_result = []
    entropy_result = []
    
    # Search within the current directory
    for i in range(file_number):
        # Read time information
        f = open(read_directory + '/update_id_time/' + str(i + 1) + '.txt')
        time_lines = f.readlines()
        f.close()
        
        # If the latest time in this segment is earlier than the query start time, skip the segment
        if float(time_lines[-1].strip().split()[-1]) < start:
            pass
        # If the earliest time in this segment is later than the query end time, stop
        elif float(time_lines[0].strip().split()[-1]) > end:
            break
        else:
            # VSM vectors of the data
            each_weibo_vsm = []
            get_text_to_complex_list(each_weibo_vsm, read_directory + '/update_vsm/' + str(i + 1) + '.txt', 0)
            
            # Word list corresponding to the VSM columns
            word_list = []
            f4 = open(read_directory + '/top_n_word/' + str(i + 1) + '.txt')
            word_lines = f4.readlines()
            f4.close()
            for each in word_lines:
                word_list.append(each.strip().split()[0])
            
            # Information entropy list
            f5 = open(read_directory + '/entropy/' + str(i + 1) + '.txt')
            entropy_list = f5.readlines()
            f5.close()
            entropy_list = [float(x.strip()) for x in entropy_list]

            # Traverse each data segment line by line
            for j in range(len(time_lines)):
                # Timestamp of the current line
                now_t = float(time_lines[j].strip().split()[-1])
                
                if (now_t >= start) and (now_t <= end):
                    if mode == "OR":
                        flag = 0
                        for each1 in keyword_list:
                            for k in range(len(word_list)):
                                if (each1 in word_list[k]) and (float(each_weibo_vsm[j][k]) > 0.000001):
                                    this_message = " ".join(vsm_map_word(each_weibo_vsm[j], word_list))
                                    if this_message not in query_result:
                                        query_result.append(this_message)
                                        entropy_result.append(entropy_list[j])
                                    flag = 1
                                    break
                            if flag == 1:
                                break
                    else:
                        flag = 0
                        for each1 in keyword_list:
                            for k in range(len(word_list)):
                                if (each1 in word_list[k]) and (float(each_weibo_vsm[j][k]) > 0.000001):
                                    flag += 1
                                    break
                                
                        if flag == len(keyword_list):
                            this_message = " ".join(vsm_map_word(each_weibo_vsm[j], word_list))
                            if this_message not in query_result:
                                query_result.append(this_message)
                                entropy_result.append(entropy_list[j])
     
    # Sort by entropy in descending order
    el = zip(entropy_result, query_result)
    el1 = sorted(el, key=itemgetter(0), reverse=True)
    # Select the corresponding line indices
    query_result2 = []
    count_number = 1
    for each in el1:
        query_result2.append(each[1])
        count_number += 1
        if count_number > select:
            break    
    
    quick_write_list_to_text(query_result2, write_filename)