예제 #1
0
def filter_phi(phi_path=constant.lda_phi,
               phi_after_path=constant.lda_phi_after,
               phi_threadhold=8):
    """Keep only the largest values of every column of a text matrix.

    The matrix is read from ``phi_path`` (one row per line, values separated
    by whitespace). For each column, the ``phi_threadhold`` largest values
    are kept and every other entry is set to 0. Entries whose value equals
    one of the kept values are kept as well, so ties can leave more than
    ``phi_threadhold`` non-zero entries in a column. The result is written
    to ``phi_after_path``, one row per line, every value followed by a
    single space.

    Args:
        phi_path: Path of the input matrix file.
        phi_after_path: Path of the filtered output file (overwritten).
        phi_threadhold: Number of top values retained per column.

    Returns:
        None.

    Raises:
        ValueError: If a token in the input cannot be parsed as a float.
        IndexError: If a row holds more values than the first row.
    """
    word_count = 0
    topic_count = 0
    with open(phi_path) as origin:
        for i, line in enumerate(origin):
            if i == 0:
                # Tokenise exactly like the loading pass below. Splitting on
                # a literal ' ' here would miscount columns whenever values
                # are separated by tabs or by more than one space.
                word_count = len(line.split())
            topic_count += 1

    phi = sparse.lil_matrix((topic_count, word_count), dtype=float)
    with open(phi_path) as origin:
        for i, line in enumerate(origin):
            # split() without arguments already ignores leading/trailing
            # whitespace, including the newline.
            for j, token in enumerate(line.split()):
                phi[i, j] = float(token)

    for j in xrange(word_count):
        column = [phi[i, j] for i in xrange(topic_count)]
        kept_values = sorted(column, reverse=True)[0:phi_threadhold]
        for i, value in enumerate(column):
            # Membership is by value, so duplicates of a kept value survive.
            if value not in kept_values:
                phi[i, j] = 0

    with open(phi_after_path, 'w') as out:
        for i in xrange(topic_count):
            for j in xrange(word_count):
                out.write(str(phi[i, j]) + ' ')
            out.write('\n')
예제 #2
0
def theta_sperator(theta_after_path=constant.lda_theta_after,
                   sperator_path=constant.lda_fcn):
    """Split a text matrix into one output file per column.

    The matrix is read from ``theta_after_path`` (one row per line, values
    separated by single spaces) into a ``lil_matrix`` whose shape is
    ``(constant.file.get_doc_count(), constant.lda_topic_count)``. For every
    column index ``j`` a file named ``sperator_path + str(j)`` is written; it
    holds one ``<doc> <value>`` line for each row whose entry in that column
    is non-zero, where ``<doc>`` is the matching element of
    ``constant.file.get_docmap(True)``.

    Args:
        theta_after_path: Path of the input matrix file.
        sperator_path: Prefix of the per-column output files.

    Returns:
        The loaded ``lil_matrix``.
    """
    n_topics = constant.lda_topic_count
    n_docs = constant.file.get_doc_count()
    theta = sparse.lil_matrix((n_docs, n_topics), dtype=float)

    with open(theta_after_path) as source:
        for doc_idx, raw_line in enumerate(source):
            tokens = raw_line.strip().split(' ')
            for topic_idx, token in enumerate(tokens):
                theta[doc_idx, topic_idx] = float(token)

    doc_names = constant.file.get_docmap(True)

    for topic_idx in xrange(n_topics):
        with open(sperator_path + str(topic_idx), 'w') as out:
            for doc_idx in xrange(n_docs):
                weight = theta[doc_idx, topic_idx]
                # Only rows with a non-zero entry are listed for this column.
                if weight == 0:
                    continue
                out.write(doc_names[doc_idx] + ' ' + str(weight) + '\n')
    return theta
예제 #3
0
def print_model(model):
    """Dump the two matrices of a fitted model to text files.

    ``model.doc_topic_`` is written to ``constant.lda_theta`` and
    ``model.topic_word_`` to ``constant.lda_phi``. Each matrix row becomes
    one line in which every value is followed by a single space.

    Args:
        model: Object exposing iterable-of-rows attributes ``doc_topic_``
            and ``topic_word_``.

    Returns:
        None.
    """
    def dump(path, rows):
        # One line per row; every value, including the last, gets a
        # trailing space before the newline.
        with open(path, 'w') as out:
            for row in rows:
                out.write(''.join(str(value) + ' ' for value in row))
                out.write('\n')

    # Output theta
    doc_topic = model.doc_topic_
    dump(constant.lda_theta, doc_topic)

    # Output phi
    topic_word = model.topic_word_
    dump(constant.lda_phi, topic_word)
예제 #4
0
def print_listmatrix(matrix, file_name, need_title_count=True):
    """Write the non-zero entries of a list-of-rows matrix to a text file.

    Every row becomes one line of ``column:value`` pairs, each pair followed
    by a single space; a row with no non-zero entry yields an empty line.
    When ``need_title_count`` is true the file starts with a ``rows*cols``
    header line, where ``cols`` is the length of the first row (0 for an
    empty matrix).

    Before each row the completed fraction is written to stdout, terminated
    by a carriage return so successive updates overwrite each other.

    Args:
        matrix: Sequence of equally long, indexable rows.
        file_name: Path of the output file (overwritten).
        need_title_count: Whether to emit the ``rows*cols`` header line.

    Returns:
        None.

    Raises:
        IndexError: If a row is shorter than the first row.
    """
    # Local import so the block does not depend on the module's import list.
    import sys

    row = len(matrix)
    # An empty matrix has no first row to measure; treat it as zero columns
    # instead of failing on matrix[0].
    col = len(matrix[0]) if row else 0
    columns = range(col)  # built once, reused for every row

    with open(file_name, 'w') as out:
        if need_title_count:
            out.write(str(row) + '*' + str(col) + '\n')
        for i, values in enumerate(matrix):
            # Writes the same bytes as the Python 2 statement
            # `print text + '\r',` while staying valid on Python 3.
            sys.stdout.write(str(float(i) / row) + '\r')
            for j in columns:
                val = values[j]
                if val != 0:
                    out.write(str(j) + ':' + str(val) + ' ')
            out.write('\n')

    # Drop this function's reference and force a collection pass; the object
    # is only freed if the caller holds no other reference to it.
    del matrix
    gc.collect()
예제 #5
0
def __print_info():
    """Write the word map and a small summary file for the current data.

    Every distinct word found in ``constant.file.get_relation()`` is given
    an index in order of first appearance, starting at 0. The map is written
    to ``constant.data_wordmap``: first a line with the number of words,
    then one ``<word> <index>`` line per entry (in dict iteration order).

    A JSON object with the keys ``ndocs`` and ``nwords`` is then written to
    ``constant.data_other``; the values are the lengths of
    ``constant.file.get_docmap()`` and ``constant.file.get_wordmap()``.

    Returns:
        None.
    """
    relation = constant.file.get_relation()
    word_map = {}
    for line in relation:
        for word in line:
            # `in` replaces the deprecated dict.has_key(); the next free
            # index always equals the number of words seen so far.
            if word not in word_map:
                word_map[word] = len(word_map)

    # Write the word map
    with open(constant.data_wordmap, 'w') as out:
        out.write(str(len(word_map)) + '\n')
        for word, index in word_map.items():
            out.write(word)
            out.write(' ')
            out.write(str(index))
            out.write('\n')

    # Write the other information
    with open(constant.data_other, 'w') as out:
        other = {}
        other['ndocs'] = len(constant.file.get_docmap())
        other['nwords'] = len(constant.file.get_wordmap())
        out.write(json.dumps(other, indent=4))