Example #1
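load_data() restores the artifacts produced by init_data() (Example #3): the unique-character set saved to UNIQ_SRC_FILE, the trained Word2Vec embeddings, and the fitted k-means model.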
def load_data():
    # FileOperator, Const, Word2Vec and MyKmeans are project-local helpers;
    # their imports are omitted in the original snippet.
    read = FileOperator.f_open(Const.UNIQ_SRC_FILE)
    # The last line of the file holds the comma-separated unique characters.
    unique_char_set = read[-1].split(",")
    w2v = Word2Vec(Const.W2V_SRC_FILE, Const.W2V_WEIGHT_FILE,
                   Const.WORD_FEAT_LEN, "load")

    # Embed every unique character, then reload the trained k-means model.
    data_array = []
    for word in unique_char_set:
        data_array.append(w2v.str_to_vector(word))
    kmeans = MyKmeans(Const.NUM_OF_CLUSTER, Const.KMEANS_SAVE_FILE, data_array,
                      "load")
    return unique_char_set, w2v, kmeans
Example #2
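learn_word() maps each character of every corpus sentence to its k-means cluster, counts cluster-to-cluster transitions, and saves the resulting table to PROB_FILE.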
import sys  # stdlib; needed for the progress display below


def learn_word():
    print("src file: ", Const.SRC_FILE)
    sentence_list = FileOperator.f_open(Const.SRC_FILE)
    sentence_list = StringOperator.split_sentence(sentence_list)
    # load_data() (Example #1) returns the unique-character set, the Word2Vec
    # model, and the k-means model; the transition table is loaded separately.
    # (Assumption: ProbabilityState supports a "load" mode analogous to the
    # Word2Vec and MyKmeans constructors.)
    _, w2v, kmeans = load_data()
    prob_state = ProbabilityState(Const.NUM_OF_CLUSTER, Const.PROB_FILE, "load")

    for cnt, sentence in enumerate(sentence_list):
        sys.stdout.write("\r progress: %d / %d" % (cnt, len(sentence_list)))
        sys.stdout.flush()
        # Count a cluster-to-cluster transition for every adjacent
        # (current, next) pair of characters in the sentence.
        for i in range(len(sentence) - 1):
            vec = w2v.str_to_vector(sentence[i]).reshape(1, -1)
            cluster = kmeans.get_cluster(vec)
            next_vec = w2v.str_to_vector(sentence[i + 1]).reshape(1, -1)
            next_cluster = kmeans.get_cluster(next_vec)
            prob_state.count_up_trainsition(cluster, next_cluster)
    prob_state.save_prob(Const.PROB_FILE)
    print()
    print("end")
Example #3
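init_data() builds everything the other functions load: the unique-character set extracted from the corpus, an empty transition table, the Word2Vec weights, and the k-means model.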
def init_data():
    print("src file: ", Const.SRC_FILE)
    sentence_list = FileOperator.f_open(Const.SRC_FILE)
    sentence_list = StringOperator.split_sentence(sentence_list)

    # Flatten the sentences and keep one entry per distinct character.
    flatten_word_list = StringOperator.array_string_to_flatten(sentence_list)
    unique_char_set = StringOperator.array_char_to_unique(flatten_word_list)
    print("unique char set len:", len(unique_char_set))
    FileOperator.f_write(Const.UNIQ_SRC_FILE, unique_char_set)
    print("save unique file: ", Const.UNIQ_SRC_FILE)
    # Create an empty transition table and persist it.
    prob_state = ProbabilityState(Const.NUM_OF_CLUSTER, Const.PROB_FILE,
                                  "init")
    prob_state.save_prob(Const.PROB_FILE)
    # "init" trains Word2Vec on the source corpus and saves the weights.
    w2v = Word2Vec(Const.W2V_SRC_FILE, Const.W2V_WEIGHT_FILE,
                   Const.WORD_FEAT_LEN, "init")

    # Embed every unique character, then fit (and save) the k-means model;
    # the returned instance itself is not needed here.
    data_array = []
    for word in unique_char_set:
        data_array.append(w2v.str_to_vector(word))
    MyKmeans(Const.NUM_OF_CLUSTER, Const.KMEANS_SAVE_FILE, data_array, "init")
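Word2Vec, MyKmeans, and ProbabilityState all take an "init"/"load" string, which suggests a shared convention: "init" builds the model and saves it, "load" restores a saved one. A minimal sketch of that convention, with hypothetical names, might look like this:

import pickle

class SavableModel:
    # Hypothetical illustration of the "init"/"load" switch used above.
    def __init__(self, save_file, data=None, mode="load"):
        self.save_file = save_file
        if mode == "init":
            self.state = self._fit(data)  # build from scratch...
            with open(save_file, "wb") as f:
                pickle.dump(self.state, f)  # ...and persist immediately
        else:
            with open(save_file, "rb") as f:
                self.state = pickle.load(f)  # restore a previous run

    def _fit(self, data):
        return {"n": len(data)}  # placeholder for real training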
Example #4
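call_sse() sweeps the number of clusters from 100 to 1900 in steps of 100, fits k-means at each setting, and plots the SSE of each fit so a cluster count can be picked with the elbow method.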
import matplotlib.pyplot as plt  # needed for the SSE plot below


def call_sse():
    read = FileOperator.f_open(Const.UNIQ_SRC_FILE)
    w2v = Word2Vec(Const.W2V_SRC_FILE, Const.W2V_WEIGHT_FILE,
                   Const.WORD_FEAT_LEN, "load")

    unique_char_set = read[-1].split(",")
    print("number of unique words:", len(unique_char_set))
    data_array = []
    for word in unique_char_set:
        data_array.append(w2v.str_to_vector(word))

    # Fit k-means for a range of cluster counts and record each model's SSE
    # so the "elbow" can be read off the plot.
    sse_list = []
    num_of_cluster_list = range(100, 2000, 100)
    for num_of_cluster in num_of_cluster_list:
        print(num_of_cluster)
        kmeans = MyKmeans(num_of_cluster, Const.KMEANS_SAVE_FILE, data_array,
                          "init")
        sse = kmeans.get_sse()  # compute once; printed and stored below
        print(sse)
        sse_list.append(sse)

    plt.plot(num_of_cluster_list, sse_list, marker='o')
    # plt.show()
    plt.savefig(Const.SSE_IMG)
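For reference, here is a minimal, self-contained version of the same elbow sweep using scikit-learn's KMeans directly on random stand-in vectors. MyKmeans's internals are not shown in these snippets, so treating its get_sse() as the equivalent of scikit-learn's inertia_ is an assumption.

import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

rng = np.random.default_rng(0)
data_array = rng.normal(size=(500, 16))  # stand-in for the character vectors

sse_list = []
num_of_cluster_list = range(2, 20, 2)  # small range so the sketch runs quickly
for k in num_of_cluster_list:
    km = KMeans(n_clusters=k, n_init=10, random_state=0).fit(data_array)
    sse_list.append(km.inertia_)  # inertia_ is the fitted model's SSE

plt.plot(list(num_of_cluster_list), sse_list, marker='o')
plt.xlabel("number of clusters")
plt.ylabel("SSE")
plt.savefig("sse_sketch.png")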