Example #1
from sklearn import svm
from jiangziya.utils.config import get_train_data_dir, get_model_dir
import os
from sklearn.datasets import load_svmlight_file
import time, joblib


def test_sk_svm_on_text_data():
    train_path = os.path.join(get_train_data_dir(), "train_tfidf.txt")
    model_path = os.path.join(get_model_dir(), "sk_libsvm.pkl")

    # clf = svm.SVC(C=100, kernel='linear')
    clf = svm.SVC()
    print(clf)

    start = time.time()
    X, y = load_svmlight_file(train_path)
    end = time.time()
    last = end - start
    print("Load lasts %.2fs" % last)

    start = time.time()
    clf.fit(X, y)
    end = time.time()
    last = end - start
    print("Train lasts %.2fs" % last)

    joblib.dump(clf, model_path)
    print("Save model to %s" % model_path)
Example #2
import os
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard, ModelCheckpoint
from jiangziya.utils.config import get_model_dir
# get_data_dir, get_log_dir, PretrainedFastText and get_pretrained_dataset are
# project-local helpers; their modules are not shown in this snippet.


def train_pretrained_fast_text():
    total_num_train = 669589  # num_lines of thucnews_train_vec.txt
    total_num_val = 83316  # num_lines of thucnews_val_vec.txt

    num_classes = 14
    epochs = 100
    #epochs = 3
    shuffle_buffer_size = 1024 * 2
    batch_size = 32
    patience = 10  # for early stopping

    data_dir = os.path.join(get_data_dir(), "text_classification")
    train_path = os.path.join(data_dir, "thucnews_train_vec.txt")
    val_path = os.path.join(data_dir, "thucnews_val_vec.txt")

    log_dir = os.path.join(get_log_dir(), "fast_text")
    checkpoint_path = os.path.join(get_model_dir(), "fast_text", "ckpt")
    history_path = os.path.join(get_log_dir(), "history", "fast_text.pkl")

    num_train_batch = total_num_train // batch_size + 1
    num_val_batch = total_num_val // batch_size + 1

    # === tf.data.Dataset
    train_dataset = get_pretrained_dataset(
        data_path=train_path,
        epochs=epochs,
        shuffle_buffer_size=shuffle_buffer_size,
        batch_size=batch_size)

    val_dataset = get_pretrained_dataset(
        data_path=val_path,
        epochs=epochs,
        shuffle_buffer_size=shuffle_buffer_size,
        batch_size=batch_size)

    # === model
    model = PretrainedFastText(num_classes=num_classes)

    # optimizer
    optimizer = tf.keras.optimizers.Adam(0.001)

    # loss
    loss = tf.keras.losses.SparseCategoricalCrossentropy()

    model.compile(optimizer=optimizer, loss=loss, metrics=['acc'])

    # callbacks
    callbacks = []

    early_stopping_cb = EarlyStopping(monitor='val_loss',
                                      patience=patience,
                                      restore_best_weights=True)
    callbacks.append(early_stopping_cb)

    tensorboard_cb = TensorBoard(log_dir=log_dir)
    callbacks.append(tensorboard_cb)

    checkpoint_cb = ModelCheckpoint(filepath=checkpoint_path,
                                    save_weights_only=True,
                                    save_best_only=True)
    callbacks.append(checkpoint_cb)

    # === Train
    history = model.fit(train_dataset,
                        epochs=epochs,
                        steps_per_epoch=num_train_batch,
                        validation_data=val_dataset,
                        validation_steps=num_val_batch,
                        callbacks=callbacks)

    model.summary()

    return history
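get_pretrained_dataset is project-local and not shown in this collection. A minimal sketch of what it might do, assuming each line of the *_vec.txt files is "label<TAB>v1,v2,...,v300" with an integer class label (the real helper may map label strings to ids instead):

import tensorflow as tf


def get_pretrained_dataset_sketch(data_path, epochs, shuffle_buffer_size,
                                  batch_size, embedding_dim=300):
    def parse_line(line):
        # Split "label \t comma-separated floats" into (vector, label).
        fields = tf.strings.split(line, '\t')
        label = tf.strings.to_number(fields[0], out_type=tf.int32)
        vec = tf.strings.to_number(tf.strings.split(fields[1], ','),
                                   out_type=tf.float32)
        vec = tf.reshape(vec, [embedding_dim])
        return vec, label

    dataset = tf.data.TextLineDataset(data_path)
    dataset = dataset.map(parse_line, num_parallel_calls=tf.data.AUTOTUNE)
    dataset = dataset.shuffle(shuffle_buffer_size).repeat(epochs)
    dataset = dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return dataset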
Example #3
from sklearn.naive_bayes import MultinomialNB
from jiangziya.utils.config import get_train_data_dir, get_model_dir
import os
from sklearn.datasets import load_svmlight_file
import time, joblib

if __name__ == '__main__':
    train_path = os.path.join(get_train_data_dir(), "train_libsvm.txt")
    model_path = os.path.join(get_model_dir(), "sk_naive_bayes.pkl")

    clf = MultinomialNB()
    print(clf)

    start = time.time()
    X, y = load_svmlight_file(train_path)
    end = time.time()
    last = end - start
    print("Load lasts %.2f" % last)

    start = time.time()
    clf.fit(X, y)
    end = time.time()
    last = end - start
    print("Train lasts %.2f" % last)

    joblib.dump(clf, model_path)
    print("Save model to %s" % model_path)
Example #4
                pred_label = np.argmax(softmax, axis=1)
                fw.write(
                    str(true_label) + '\t' + str(pred_label[0]) + '\t' + '\n')
                line_cnt += 1
                if line_cnt % 1000 == 0:
                    print(line_cnt)
            print("Total line %d" % line_cnt)


if __name__ == '__main__':
    num_classes = 14
    max_seq_len = 350
    model_name = "pretrained_text_cnn"

    checkpoint_dir = os.path.join(get_model_dir(), model_name)

    data_dir = os.path.join(get_data_dir(), "text_classification")
    #val_path = os.path.join(data_dir, "thucnews_val_vec.txt")
    test_path = os.path.join(data_dir, "thucnews_test_seg.txt")
    test_result_path = os.path.join(data_dir,
                                    "thucnews_test_" + model_name + ".txt")

    word_vector_dict_path = os.path.join(get_model_dir(), "sogou_vectors.pkl")

    # === Load word_vec_dict
    word_vec_dict = load_word_vector_dict(
        word_vector_dict_path=word_vector_dict_path)
    print("#word_vec_dict = %d" % len(word_vec_dict))

    # === Build and compile model.
Example #5
                vecs_str = ','.join(list(map(lambda x: str(x), vecs)))
                fw.write(label + '\t' + vecs_str + '\n')
                line_cnt += 1
                if line_cnt % 1000 == 0:
                    print(line_cnt)
            print("Total line %d" % line_cnt)


if __name__ == '__main__':
    #file_type = "thucnews_train"
    #file_type = "thucnews_val"
    file_type = "thucnews_test"
    text_path = os.path.join(get_data_dir(), "text_classification",
                             file_type + "_seg.txt")
    vec_path = os.path.join(get_data_dir(), "text_classification",
                            file_type + "_vec.txt")

    word_vector_dict_path = os.path.join(get_model_dir(), "sogou_vectors.pkl")

    # === Load word_vec_dict
    word_vec_dict = load_word_vector_dict(
        word_vector_dict_path=word_vector_dict_path)
    print("#word_vec_dict = %d" % len(word_vec_dict))

    # === Compute vectors for file, merge title and text as ONE file.
    compute_vectors(text_path=text_path,
                    vec_path=vec_path,
                    word_vec_dict=word_vec_dict)
    print("Write done! %s" % vec_path)
Example #6
                                  dtype=np.float32).reshape((-1, 300))
                # [1, num_classes=14]
                softmax = model(inputs)

                pred_label = np.argmax(softmax, axis=1)
                fw.write(
                    str(true_label) + '\t' + str(pred_label[0]) + '\t' + '\n')
                line_cnt += 1
                if line_cnt % 1000 == 0:
                    print(line_cnt)
            print("Total line %d" % line_cnt)


if __name__ == '__main__':
    num_classes = 14
    checkpoint_dir = os.path.join(get_model_dir(), "fast_text")

    data_dir = os.path.join(get_data_dir(), "text_classification")
    #val_path = os.path.join(data_dir, "thucnews_val_vec.txt")
    test_path = os.path.join(data_dir, "thucnews_test_vec.txt")
    test_result_path = os.path.join(data_dir, "thucnews_test_fast_text.txt")

    # === Build and compile model.
    model = PretrainedFastText(num_classes=num_classes)
    optimizer = tf.keras.optimizers.Adam(0.001)
    loss = tf.keras.losses.SparseCategoricalCrossentropy()

    model.compile(optimizer=optimizer, loss=loss, metrics=['acc'])

    # === Load weights.
    checkpoint = tf.train.latest_checkpoint(checkpoint_dir=checkpoint_dir)
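The snippet above breaks off right after locating the latest checkpoint. A minimal continuation sketch (not from the original source): restore the saved weights and push one 300-d document vector through the model:

import numpy as np

model.load_weights(checkpoint)  # restore best weights saved during training

dummy = np.zeros((1, 300), dtype=np.float32)  # one averaged document vector
softmax = model(dummy)                        # shape: [1, num_classes]
print(np.argmax(softmax, axis=1))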
Example #7
import os
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard, ModelCheckpoint
from jiangziya.utils.config import get_model_dir
# get_data_dir, get_log_dir, load_word_vector_dict, get_dataset and
# PretrainedTextCNN are project-local helpers; their modules are not shown
# in this snippet.


def train_model():
    total_num_train = 669589  # num_lines of thucnews_train_seg.txt
    total_num_val = 83316  # num_lines of thucnews_test_seg.txt

    max_seq_len = 350  # avg #words per sequence is ~380; drops to ~350 after stop-word removal.

    filters = 100
    kernel_size = 5
    dense_units = 128  # 3 * filters ==> dense_units ==> 14
    dropout_keep_ratio = 0.5

    num_classes = 14
    epochs = 100
    #epochs = 3
    shuffle_buffer_size = 1024 * 2
    batch_size = 32
    patience = 10  # for early stopping

    model_name = "pretrained_text_cnn"

    data_dir = os.path.join(get_data_dir(), "text_classification")
    train_path = os.path.join(data_dir, "thucnews_train_seg.txt")
    val_path = os.path.join(data_dir, "thucnews_test_seg.txt")

    log_dir = os.path.join(get_log_dir(), model_name)
    checkpoint_path = os.path.join(get_model_dir(), model_name, "ckpt")
    history_path = os.path.join(get_log_dir(), "history", model_name + ".pkl")

    word_vector_dict_path = os.path.join(get_model_dir(), "sogou_vectors.pkl")
    print('word_vector', word_vector_dict_path)
    # === Load word_vec_dict
    word_vec_dict = load_word_vector_dict(
        word_vector_dict_path=word_vector_dict_path)
    print("#word_vec_dict = %d" % len(word_vec_dict))

    num_train_batch = total_num_train // batch_size + 1
    num_val_batch = total_num_val // batch_size + 1

    # === tf.data.Dataset
    train_dataset = get_dataset(data_path=train_path,
                                epochs=epochs,
                                shuffle_buffer_size=shuffle_buffer_size,
                                batch_size=batch_size,
                                max_seq_len=max_seq_len,
                                word_vec_dict=word_vec_dict)

    val_dataset = get_dataset(data_path=val_path,
                              epochs=epochs,
                              shuffle_buffer_size=shuffle_buffer_size,
                              batch_size=batch_size,
                              max_seq_len=max_seq_len,
                              word_vec_dict=word_vec_dict)

    # === model
    model = PretrainedTextCNN(num_classes=num_classes,
                              filters=filters,
                              kernel_size=kernel_size,
                              dense_units=dense_units,
                              dropout_keep_ratio=dropout_keep_ratio,
                              max_seq_len=max_seq_len)

    # optimizer
    optimizer = tf.keras.optimizers.Adam(0.001)

    # loss
    loss = tf.keras.losses.SparseCategoricalCrossentropy()

    model.compile(optimizer=optimizer, loss=loss, metrics=['acc'])

    # callbacks
    callbacks = []

    early_stopping_cb = EarlyStopping(monitor='val_loss',
                                      patience=patience,
                                      restore_best_weights=True)
    callbacks.append(early_stopping_cb)

    tensorboard_cb = TensorBoard(log_dir=log_dir)
    callbacks.append(tensorboard_cb)

    checkpoint_cb = ModelCheckpoint(filepath=checkpoint_path,
                                    save_weights_only=True,
                                    save_best_only=True)
    callbacks.append(checkpoint_cb)

    # === Train
    history = model.fit(train_dataset,
                        epochs=epochs,
                        steps_per_epoch=num_train_batch,
                        validation_data=val_dataset,
                        validation_steps=num_val_batch,
                        callbacks=callbacks)

    model.summary()

    return history
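PretrainedTextCNN is project-local; a sketch of what it could look like, inferred from the hyperparameters above (three parallel Conv1D branches whose pooled outputs concatenate to 3 * filters features, then dense_units, then num_classes). The branch kernel sizes are an assumption:

import tensorflow as tf


class PretrainedTextCNNSketch(tf.keras.Model):
    def __init__(self, num_classes, filters, kernel_size,
                 dense_units, dropout_keep_ratio, max_seq_len):
        super().__init__()
        # max_seq_len is kept for signature parity; the conv layers here
        # do not need it. Branch kernel sizes are assumed around kernel_size.
        self.convs = [tf.keras.layers.Conv1D(filters, k, activation='relu')
                      for k in (kernel_size - 2, kernel_size, kernel_size + 2)]
        self.pool = tf.keras.layers.GlobalMaxPooling1D()
        self.dense = tf.keras.layers.Dense(dense_units, activation='relu')
        self.dropout = tf.keras.layers.Dropout(1.0 - dropout_keep_ratio)
        self.out = tf.keras.layers.Dense(num_classes, activation='softmax')

    def call(self, inputs, training=False):
        # inputs: [batch, max_seq_len, 300] pretrained word vectors
        branches = [self.pool(conv(inputs)) for conv in self.convs]
        x = tf.concat(branches, axis=-1)          # [batch, 3 * filters]
        x = self.dropout(self.dense(x), training=training)
        return self.out(x)                        # [batch, num_classes]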
Example #8
from jiangziya.utils.config import get_train_data_dir, get_model_dir
import os
from sklearn.datasets import load_svmlight_file
import time, joblib

if __name__ == '__main__':
    test_path = os.path.join(get_train_data_dir(), "test_tfidf.txt")
    test_result_path = os.path.join(get_train_data_dir(),
                                    "thucnews_test_sk_libsvm.txt")
    model_path = os.path.join(get_model_dir(), "sk_libsvm.pkl")

    clf = joblib.load(model_path)
    print(clf)
    print("Load model done!")

    start = time.time()
    X, y = load_svmlight_file(test_path)
    end = time.time()
    last = end - start
    print("Load data lasts %.2fs" % last)

    start = time.time()
    y_pred_list = clf.predict(X)
    end = time.time()
    last = end - start
    print("Test lasts %.2fs" % last)

    with open(test_result_path, 'w', encoding='utf-8') as fw:
        line_cnt = 0
        for y_true, y_pred in zip(y, y_pred_list):
            fw.write(str(y_true) + '\t' + str(y_pred) + '\n')
            line_cnt += 1
        print("Total line %d" % line_cnt)
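The script writes per-line predictions but never reports an overall score; a short follow-up sketch using the y and y_pred_list arrays already in memory:

from sklearn.metrics import accuracy_score, classification_report

print("Accuracy: %.4f" % accuracy_score(y, y_pred_list))
print(classification_report(y, y_pred_list))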
Example #9
import os
import pickle
import numpy as np
from jiangziya.utils.config import get_model_dir


def get_word_vector_dict(word_vector_path):
    # header line: 365076 300; num_words, embedding_dim
    # remaining lines: word \s vec_1 \s vec_2 ... \s vec_300 (space-separated)

    # Output: {word: vec in numpy.ndarray}
    word_vector_dict = {}
    with open(word_vector_path, 'r', encoding='utf-8') as f:
        header = False
        for line in f:
            if not header:
                header = True
                continue

            buf = line[:-1].strip().split(' ')
            word = buf[0]
            # [300, ]
            vec = np.array(list(map(lambda x: float(x), buf[1:])),
                           dtype=np.float32)
            word_vector_dict[word] = vec

    return word_vector_dict


if __name__ == '__main__':
    word_vector_path = os.path.join(get_model_dir(), "sgns.sogou.char")
    word_vector_dict_path = os.path.join(get_model_dir(), "sogou_vectors.pkl")

    word_vector_dict = get_word_vector_dict(word_vector_path=word_vector_path)

    with open(word_vector_dict_path, 'wb') as fw:
        pickle.dump(word_vector_dict, fw)
        print("Write done! %s" % word_vector_dict_path)