Example #1
import os
import time

from sklearn import svm
from sklearn.metrics import classification_report

# get_data_dir() and load_label_data() are project-local helpers.


def test_sk_svm_on_sample_data():
    # data from mllib
    train_data_path = os.path.join(get_data_dir(), "mllib", "sample_svm_train")
    test_data_path = os.path.join(get_data_dir(), "mllib", "sample_svm_test")

    start = time.time()
    #clf = svm.SVC(kernel='linear', C=1)  # acc=0.50, k(x, x') = x^T x'
    #clf = svm.SVC(kernel='poly', C=1, gamma='scale')  # acc=0.61, k(x, x') = (gamma * x^T x' + r)^d
    clf = svm.SVC(kernel='rbf', C=1, gamma='scale')  # acc=0.62, k(x, x') = exp(-gamma * ||x - x'||^2)
    #clf = svm.SVC(kernel='sigmoid', C=10)  # acc=0.54, k(x, x') = tanh(gamma * x^T x' + r)
    print(clf)
    #X, y = load_svmlight_file(train_data_path)
    X, y = load_label_data(data_path=train_data_path)
    clf.fit(X, y)
    end = time.time()
    elapsed = end - start
    print("Training took %.2fs" % elapsed)

    X, y_true = load_label_data(data_path=test_data_path)
    y_pred = clf.predict(X)
    print(classification_report(y_true, y_pred))
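
The commented-out SVC lines above are a manual sweep over kernels; the same comparison can be automated with cross-validation. A minimal sketch using scikit-learn's GridSearchCV (the grid values here are illustrative assumptions, not from the original):

from sklearn import svm
from sklearn.model_selection import GridSearchCV

def grid_search_svm_kernels(X, y):
    # Cross-validated search over the four kernels tried by hand above.
    param_grid = {
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'C': [1, 10],
    }
    search = GridSearchCV(svm.SVC(gamma='scale'), param_grid, cv=3)
    search.fit(X, y)
    print(search.best_params_, search.best_score_)
    return search.best_estimator_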
Example #2
import os

# get_data_dir() is a project-local helper.

# NOTE: the head of dataset_generator() is truncated in the source; only the
# tail of its tf.data pipeline survives here (a reconstruction sketch follows).
        .shuffle(buffer_size=shuffle_buffer_size)\
        .batch(batch_size=batch_size)
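
A minimal sketch of what the truncated dataset_generator() might look like, assuming tf.data.TextLineDataset over "label\tcomma-separated-floats" lines with numeric label ids (the parse function and the unused word2id_dict pass-through are assumptions):

import tensorflow as tf

def dataset_generator(data_path=None,
                      epochs=10,
                      shuffle_buffer_size=1024,
                      batch_size=16,
                      word2id_dict=None):
    # word2id_dict is kept for API compatibility; the *_vec.txt files used in
    # the main block below already contain precomputed 300-dim vectors.
    def parse_line(line):
        fields = tf.strings.split(line, '\t')
        label = tf.strings.to_number(fields[0], out_type=tf.int32)
        inputs = tf.strings.to_number(tf.strings.split(fields[1], ','),
                                      out_type=tf.float32)
        return inputs, label

    return tf.data.TextLineDataset(data_path)\
        .map(parse_line)\
        .repeat(epochs)\
        .shuffle(buffer_size=shuffle_buffer_size)\
        .batch(batch_size=batch_size)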


def get_dataset(data_path=None,
                epochs=10,
                shuffle_buffer_size=1024,
                batch_size=16,
                word2id_dict=None):
    return dataset_generator(data_path=data_path,
                             epochs=epochs,
                             shuffle_buffer_size=shuffle_buffer_size,
                             batch_size=batch_size,
                             word2id_dict=word2id_dict)


if __name__ == "__main__":
    data_dir = os.path.join(get_data_dir(), "text_classification")
    train_path = os.path.join(data_dir, "thucnews_train_vec.txt")

    # TODO: build or load the real word-to-id mapping
    word2id_dict = None  # placeholder; the assignment was left unfinished in the source

    train_dataset = get_dataset(data_path=train_path,
                                batch_size=4,
                                word2id_dict=word2id_dict)

    # inputs: [None, 300], labels: [None]
    # Peek at the first two batches.
    for i, (inputs, labels) in zip(range(2), train_dataset):
        print(i, inputs.shape, labels.shape)
Example #3
import os

import tensorflow as tf
from tensorflow.keras.callbacks import (EarlyStopping, ModelCheckpoint,
                                        TensorBoard)

# get_data_dir(), get_log_dir(), get_model_dir(), get_pretrained_dataset()
# and PretrainedFastText are project-local helpers.


def train_pretrained_fast_text():
    total_num_train = 669589  # num_lines of thucnews_train_vec.txt
    total_num_val = 83316  # num_lines of thucnews_val_vec.txt

    num_classes = 14
    epochs = 100
    #epochs = 3
    shuffle_buffer_size = 1024 * 2
    batch_size = 32
    patience = 10  # for early stopping

    data_dir = os.path.join(get_data_dir(), "text_classification")
    train_path = os.path.join(data_dir, "thucnews_train_vec.txt")
    val_path = os.path.join(data_dir, "thucnews_val_vec.txt")

    log_dir = os.path.join(get_log_dir(), "fast_text")
    checkpoint_path = os.path.join(get_model_dir(), "fast_text", "ckpt")
    history_path = os.path.join(get_log_dir(), "history", "fast_text.pkl")

    # ceil(total / batch_size) batches per epoch
    num_train_batch = (total_num_train + batch_size - 1) // batch_size
    num_val_batch = (total_num_val + batch_size - 1) // batch_size

    # === tf.data.Dataset
    train_dataset = get_pretrained_dataset(
        data_path=train_path,
        epochs=epochs,
        shuffle_buffer_size=shuffle_buffer_size,
        batch_size=batch_size)

    val_dataset = get_pretrained_dataset(
        data_path=val_path,
        epochs=epochs,
        shuffle_buffer_size=shuffle_buffer_size,
        batch_size=batch_size)

    # === model
    model = PretrainedFastText(num_classes=num_classes)

    # optimizer
    optimizer = tf.keras.optimizers.Adam(0.001)

    # loss
    loss = tf.keras.losses.SparseCategoricalCrossentropy()

    model.compile(optimizer=optimizer, loss=loss, metrics=['acc'])

    # callbacks
    callbacks = []

    early_stopping_cb = EarlyStopping(monitor='val_loss',
                                      patience=patience,
                                      restore_best_weights=True)
    callbacks.append(early_stopping_cb)

    tensorboard_cb = TensorBoard(log_dir=log_dir)
    callbacks.append(tensorboard_cb)

    checkpoint_cb = ModelCheckpoint(filepath=checkpoint_path,
                                    save_weights_only=True,
                                    save_best_only=True)
    callbacks.append(checkpoint_cb)

    # === Train
    history = model.fit(train_dataset,
                        epochs=epochs,
                        steps_per_epoch=num_train_batch,
                        validation_data=val_dataset,
                        validation_steps=num_val_batch,
                        callbacks=callbacks)

    model.summary()  # summary() prints directly and returns None

    return history
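
After training, the best weights written by ModelCheckpoint can be restored for evaluation. A minimal sketch assuming the same project-local PretrainedFastText class and checkpoint path; expect_partial() just silences warnings about unrestored optimizer state:

def load_best_fast_text(checkpoint_path, num_classes=14):
    # Rebuild the architecture, then load the best weights saved by fit().
    model = PretrainedFastText(num_classes=num_classes)
    model.load_weights(checkpoint_path).expect_partial()
    return model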
Example #4
import os

# get_data_dir() and load_stopwords_dict() are project-local helpers.

# NOTE: the head of segmentation_file() is truncated in the source; only the
# tail of its write loop survives here (a reconstruction sketch follows).
                fw.write(label + '\t' + seg_title + '\t' + seg_text + '\n')
                line_cnt += 1
                if line_cnt % 1000 == 0:
                    print(line_cnt)

            print("Total lines: %d" % line_cnt)
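
A minimal reconstruction of the truncated segmentation_file(), assuming jieba for Chinese word segmentation and a "label\ttitle\ttext" input format (both are assumptions); stop-words are filtered with the dictionary loaded in the main block below:

import jieba

def segmentation_file(file_path=None, seg_path=None, stopwords_dict=None):
    line_cnt = 0
    with open(file_path, 'r', encoding='utf-8') as fr, \
            open(seg_path, 'w', encoding='utf-8') as fw:
        for line in fr:
            label, title, text = line.rstrip('\n').split('\t')
            # Segment, then drop stop-words and punctuation.
            seg_title = ' '.join(w for w in jieba.cut(title)
                                 if w not in stopwords_dict)
            seg_text = ' '.join(w for w in jieba.cut(text)
                                if w not in stopwords_dict)
            fw.write(label + '\t' + seg_title + '\t' + seg_text + '\n')
            line_cnt += 1
            if line_cnt % 1000 == 0:
                print(line_cnt)
        print("Total lines: %d" % line_cnt)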


if __name__ == '__main__':
    #file_type = "thucnews_train"
    #file_type = "thucnews_val"
    file_type = "thucnews_test"

    file_name = "shuf_" + file_type + ".txt"  # use shuffled file to seg
    seg_file_name = file_type + "_seg.txt"
    file_path = os.path.join(get_data_dir(), "text_classification", file_name)
    seg_path = os.path.join(get_data_dir(), "text_classification",
                            seg_file_name)

    #stopwords_path = os.path.join(get_data_dir(), "nlp", "baidu_stopwords.txt")
    stopwords_dict_path = os.path.join(get_data_dir(), "nlp",
                                       "stopwords_dict.pkl")

    # === Load stopwords_dict
    stopwords_dict = load_stopwords_dict(
        stopwords_dict_path=stopwords_dict_path)
    print("#stopwords_dict = %d" % len(stopwords_dict))

    # === Segmentation
    segmentation_file(file_path=file_path,
                      seg_path=seg_path,
                      stopwords_dict=stopwords_dict)  # final argument truncated in the source; stopwords_dict assumed
Example #5
import os

# get_data_dir(), get_model_dir() and load_word_vector_dict() are
# project-local helpers.

# NOTE: the head of compute_vectors() is truncated in the source; only the
# tail of its vector-averaging loop survives here (a reconstruction sketch
# follows).
                    continue
                vecs /= num_word

                vecs_str = ','.join(map(str, vecs))
                fw.write(label + '\t' + vecs_str + '\n')
                line_cnt += 1
                if line_cnt % 1000 == 0:
                    print(line_cnt)
            print("Total lines: %d" % line_cnt)


if __name__ == '__main__':
    #file_type = "thucnews_train"
    #file_type = "thucnews_val"
    file_type = "thucnews_test"
    text_path = os.path.join(get_data_dir(), "text_classification",
                             file_type + "_seg.txt")
    vec_path = os.path.join(get_data_dir(), "text_classification",
                            file_type + "_vec.txt")

    word_vector_dict_path = os.path.join(get_model_dir(), "sogou_vectors.pkl")

    # === Load word_vec_dict
    word_vec_dict = load_word_vector_dict(
        word_vector_dict_path=word_vector_dict_path)
    print("#word_vec_dict = %d" % len(word_vec_dict))

    # === Compute document vectors; title and text are merged into one vector per line.
    compute_vectors(text_path=text_path,
                    vec_path=vec_path,
                    word_vec_dict=word_vec_dict)
Example #6
import os
import pickle
import string

from zhon import hanzi  # Chinese punctuation; assumed to be the zhon package

# get_data_dir() is a project-local helper.


def generate_stopwords_dict(stopwords_path_list=None,
                            stopwords_dict_path=None):
    stopwords_dict = {}

    for stopwords_path in stopwords_path_list:
        with open(stopwords_path, 'r', encoding='utf-8') as fr:
            for line in fr:
                stopword = line.rstrip('\n')  # strip only the trailing newline
                stopwords_dict[stopword] = True

    # English punctuation
    for char in string.punctuation:
        stopwords_dict[char] = True

    # Chinese punctuation
    for char in hanzi.punctuation:
        stopwords_dict[char] = True

    with open(stopwords_dict_path, 'wb') as fw:
        pickle.dump(stopwords_dict, fw)
        print("Write done! %s" % stopwords_dict_path)


if __name__ == '__main__':
    baidu_stopwords_path = os.path.join(get_data_dir(), "nlp",
                                        "baidu_stopwords.txt")
    stopwords_dict_path = os.path.join(get_data_dir(), "nlp",
                                       "stopwords_dict.pkl")

    generate_stopwords_dict(stopwords_path_list=[baidu_stopwords_path],
                            stopwords_dict_path=stopwords_dict_path)
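
The load_stopwords_dict() helper used in Example #4 is the pickle counterpart of the writer above; a minimal sketch:

import pickle

def load_stopwords_dict(stopwords_dict_path=None):
    # Read back the dict pickled by generate_stopwords_dict().
    with open(stopwords_dict_path, 'rb') as fr:
        return pickle.load(fr)
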
Example #7
import os

import tensorflow as tf
from tensorflow.keras.callbacks import (EarlyStopping, ModelCheckpoint,
                                        TensorBoard)

# get_data_dir(), get_log_dir(), get_model_dir(), get_dataset(),
# load_word_vector_dict() and PretrainedTextCNN are project-local helpers.


def train_model():
    total_num_train = 669589  # num_lines of thucnews_train_seg.txt
    total_num_val = 83316  # num_lines of thucnews_test_seg.txt

    max_seq_len = 350  # average sequence length is ~380 words; ~350 after stop-word removal

    filters = 100
    kernel_size = 5
    dense_units = 128  # 3 * filters ==> dense_units ==> 14
    dropout_keep_ratio = 0.5

    num_classes = 14
    epochs = 100
    #epochs = 3
    shuffle_buffer_size = 1024 * 2
    batch_size = 32
    patience = 10  # for early stopping

    model_name = "pretrained_text_cnn"

    data_dir = os.path.join(get_data_dir(), "text_classification")
    train_path = os.path.join(data_dir, "thucnews_train_seg.txt")
    val_path = os.path.join(data_dir, "thucnews_test_seg.txt")  # NOTE: the test split doubles as validation here

    log_dir = os.path.join(get_log_dir(), model_name)
    checkpoint_path = os.path.join(get_model_dir(), model_name, "ckpt")
    history_path = os.path.join(get_log_dir(), "history", model_name + ".pkl")

    word_vector_dict_path = os.path.join(get_model_dir(), "sogou_vectors.pkl")
    print("word_vector_dict_path:", word_vector_dict_path)
    # === Load word_vec_dict
    word_vec_dict = load_word_vector_dict(
        word_vector_dict_path=word_vector_dict_path)
    print("#word_vec_dict = %d" % len(word_vec_dict))

    # ceil(total / batch_size) batches per epoch
    num_train_batch = (total_num_train + batch_size - 1) // batch_size
    num_val_batch = (total_num_val + batch_size - 1) // batch_size

    # === tf.data.Dataset
    train_dataset = get_dataset(data_path=train_path,
                                epochs=epochs,
                                shuffle_buffer_size=shuffle_buffer_size,
                                batch_size=batch_size,
                                max_seq_len=max_seq_len,
                                word_vec_dict=word_vec_dict)

    val_dataset = get_dataset(data_path=val_path,
                              epochs=epochs,
                              shuffle_buffer_size=shuffle_buffer_size,
                              batch_size=batch_size,
                              max_seq_len=max_seq_len,
                              word_vec_dict=word_vec_dict)

    # === model
    model = PretrainedTextCNN(num_classes=num_classes,
                              filters=filters,
                              kernel_size=kernel_size,
                              dense_units=dense_units,
                              dropout_keep_ratio=dropout_keep_ratio,
                              max_seq_len=max_seq_len)

    # optimizer
    optimizer = tf.keras.optimizers.Adam(0.001)

    # loss
    loss = tf.keras.losses.SparseCategoricalCrossentropy()

    model.compile(optimizer=optimizer, loss=loss, metrics=['acc'])

    # callbacks
    callbacks = []

    early_stopping_cb = EarlyStopping(monitor='val_loss',
                                      patience=patience,
                                      restore_best_weights=True)
    callbacks.append(early_stopping_cb)

    tensorboard_cb = TensorBoard(log_dir=log_dir)
    callbacks.append(tensorboard_cb)

    checkpoint_cb = ModelCheckpoint(filepath=checkpoint_path,
                                    save_weights_only=True,
                                    save_best_only=True)
    callbacks.append(checkpoint_cb)

    # === Train
    history = model.fit(train_dataset,
                        epochs=epochs,
                        steps_per_epoch=num_train_batch,
                        validation_data=val_dataset,
                        validation_steps=num_val_batch,
                        callbacks=callbacks)

    model.summary()  # summary() prints directly and returns None

    return history
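
history_path is defined in both training functions above but never written. A minimal sketch of persisting the Keras history there with pickle (the makedirs call is an assumption):

import os
import pickle

def save_history(history, history_path):
    os.makedirs(os.path.dirname(history_path), exist_ok=True)
    with open(history_path, 'wb') as fw:
        # history.history maps metric names to per-epoch value lists.
        pickle.dump(history.history, fw)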
Example #8
import os

import numpy as np

# get_data_dir() is a project-local helper.


def split_train_val_test(text_path, train_path, val_path, test_path,
                         train_ratio=0.8, val_ratio=0.1):
    # NOTE: the function head is truncated in the source; the signature and
    # the three output handles are reconstructed from the calls below.
    fw_train = open(train_path, 'w', encoding='utf-8')
    fw_val = open(val_path, 'w', encoding='utf-8')
    fw_test = open(test_path, 'w', encoding='utf-8')

    with open(text_path, 'r', encoding='utf-8') as f:
        for line in f:
            r = np.random.random()
            if r < train_ratio:
                fw_train.write(line)
            elif r < train_ratio + val_ratio:
                fw_val.write(line)
            else:
                fw_test.write(line)

    fw_train.close()
    fw_val.close()
    fw_test.close()


if __name__ == '__main__':
    text_path = os.path.join(get_data_dir(), "thucnews.txt")
    train_path = os.path.join(get_data_dir(), "thucnews_train.txt")
    val_path = os.path.join(get_data_dir(), "thucnews_val.txt")
    test_path = os.path.join(get_data_dir(), "thucnews_test.txt")

    train_ratio = 0.8
    val_ratio = 0.1
    split_train_val_test(text_path,
                         train_path,
                         val_path,
                         test_path,
                         train_ratio=train_ratio,
                         val_ratio=val_ratio)

    print("Write done!", train_path)