def test_sk_svm_on_text_data():
    train_path = os.path.join(get_train_data_dir(), "train_tfidf.txt")
    model_path = os.path.join(get_model_dir(), "sk_libsvm.pkl")

    # clf = svm.SVC(C=100, kernel='linear')
    clf = svm.SVC()
    print(clf)

    start = time.time()
    X, y = load_svmlight_file(train_path)
    end = time.time()
    last = end - start
    print("Load lasts %.2fs" % last)

    start = time.time()
    clf.fit(X, y)
    end = time.time()
    last = end - start
    print("Train lasts %.2fs" % last)

    joblib.dump(clf, model_path)
    print("Save model to %s" % model_path)
def train_pretrained_fast_text():
    total_num_train = 669589    # num_lines of thucnews_train_vec.txt
    total_num_val = 83316       # num_lines of thucnews_val_vec.txt
    num_classes = 14
    epochs = 100
    #epochs = 3
    shuffle_buffer_size = 1024 * 2
    batch_size = 32
    patience = 10   # for early stopping

    data_dir = os.path.join(get_data_dir(), "text_classification")
    train_path = os.path.join(data_dir, "thucnews_train_vec.txt")
    val_path = os.path.join(data_dir, "thucnews_val_vec.txt")
    log_dir = os.path.join(get_log_dir(), "fast_text")
    checkpoint_path = os.path.join(get_model_dir(), "fast_text", "ckpt")
    history_path = os.path.join(get_log_dir(), "history", "fast_text.pkl")

    num_train_batch = total_num_train // batch_size + 1
    num_val_batch = total_num_val // batch_size + 1

    # === tf.data.Dataset
    train_dataset = get_pretrained_dataset(data_path=train_path,
                                           epochs=epochs,
                                           shuffle_buffer_size=shuffle_buffer_size,
                                           batch_size=batch_size)
    val_dataset = get_pretrained_dataset(data_path=val_path,
                                         epochs=epochs,
                                         shuffle_buffer_size=shuffle_buffer_size,
                                         batch_size=batch_size)

    # === model
    model = PretrainedFastText(num_classes=num_classes)

    # optimizer
    optimizer = tf.keras.optimizers.Adam(0.001)
    # loss
    loss = tf.keras.losses.SparseCategoricalCrossentropy()
    model.compile(optimizer=optimizer, loss=loss, metrics=['acc'])

    # callbacks
    callbacks = []
    early_stopping_cb = EarlyStopping(monitor='val_loss',
                                      patience=patience,
                                      restore_best_weights=True)
    callbacks.append(early_stopping_cb)
    tensorboard_cb = TensorBoard(log_dir=log_dir)
    callbacks.append(tensorboard_cb)
    checkpoint_cb = ModelCheckpoint(filepath=checkpoint_path,
                                    save_weights_only=True,
                                    save_best_only=True)
    callbacks.append(checkpoint_cb)

    # === Train
    history = model.fit(train_dataset,
                        epochs=epochs,
                        steps_per_epoch=num_train_batch,
                        validation_data=val_dataset,
                        validation_steps=num_val_batch,
                        callbacks=callbacks)
    print(model.summary())

    return history
from sklearn.naive_bayes import MultinomialNB
from jiangziya.utils.config import get_train_data_dir, get_model_dir
import os
from sklearn.datasets import load_svmlight_file
import time, joblib


if __name__ == '__main__':
    train_path = os.path.join(get_train_data_dir(), "train_libsvm.txt")
    model_path = os.path.join(get_model_dir(), "sk_naive_bayes.pkl")

    clf = MultinomialNB()
    print(clf)

    start = time.time()
    X, y = load_svmlight_file(train_path)
    end = time.time()
    last = end - start
    print("Load lasts %.2fs" % last)

    start = time.time()
    clf.fit(X, y)
    end = time.time()
    last = end - start
    print("Train lasts %.2fs" % last)

    joblib.dump(clf, model_path)
    print("Save model to %s" % model_path)
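For completeness, a minimal evaluation sketch for the Naive Bayes model saved above; the "test_libsvm.txt" file name is an assumption (a held-out split in the same libsvm format as train_libsvm.txt), not a file the scripts above create.

# Minimal evaluation sketch for the MultinomialNB model saved above.
# NOTE: "test_libsvm.txt" is a hypothetical held-out split; adjust the
# name to whatever test file your setup actually produces.
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_svmlight_file
from jiangziya.utils.config import get_train_data_dir, get_model_dir
import os, joblib

if __name__ == '__main__':
    test_path = os.path.join(get_train_data_dir(), "test_libsvm.txt")   # hypothetical path
    model_path = os.path.join(get_model_dir(), "sk_naive_bayes.pkl")

    clf = joblib.load(model_path)
    X, y = load_svmlight_file(test_path)
    y_pred = clf.predict(X)
    print("Accuracy = %.4f" % accuracy_score(y, y_pred))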
            pred_label = np.argmax(softmax, axis=1)

            fw.write(str(true_label) + '\t' + str(pred_label[0]) + '\t' + '\n')

            line_cnt += 1
            if line_cnt % 1000 == 0:
                print(line_cnt)
        print("Total line %d" % line_cnt)


if __name__ == '__main__':
    num_classes = 14
    max_seq_len = 350
    model_name = "pretrained_text_cnn"

    checkpoint_dir = os.path.join(get_model_dir(), model_name)
    data_dir = os.path.join(get_data_dir(), "text_classification")
    #val_path = os.path.join(data_dir, "thucnews_val_vec.txt")
    test_path = os.path.join(data_dir, "thucnews_test_seg.txt")
    test_result_path = os.path.join(data_dir, "thucnews_test_" + model_name + ".txt")
    word_vector_dict_path = os.path.join(get_model_dir(), "sogou_vectors.pkl")

    # === Load word_vec_dict
    word_vec_dict = load_word_vector_dict(word_vector_dict_path=word_vector_dict_path)
    print("#word_vec_dict = %d" % len(word_vec_dict))

    # === Build and compile model.
        vecs_str = ','.join(list(map(lambda x: str(x), vecs)))
        fw.write(label + '\t' + vecs_str + '\n')

        line_cnt += 1
        if line_cnt % 1000 == 0:
            print(line_cnt)
    print("Total line %d" % line_cnt)


if __name__ == '__main__':
    #file_type = "thucnews_train"
    #file_type = "thucnews_val"
    file_type = "thucnews_test"

    text_path = os.path.join(get_data_dir(), "text_classification", file_type + "_seg.txt")
    vec_path = os.path.join(get_data_dir(), "text_classification", file_type + "_vec.txt")
    word_vector_dict_path = os.path.join(get_model_dir(), "sogou_vectors.pkl")

    # === Load word_vec_dict
    word_vec_dict = load_word_vector_dict(word_vector_dict_path=word_vector_dict_path)
    print("#word_vec_dict = %d" % len(word_vec_dict))

    # === Compute vectors for file, merge title and text as ONE file.
    compute_vectors(text_path=text_path, vec_path=vec_path, word_vec_dict=word_vec_dict)
    print("Write done! %s" % vec_path)
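For reference, a small sketch of how one line of the *_vec.txt files written above (label, then a tab, then comma-separated float values) can be parsed back; the sample line below is fabricated for illustration only.

# Parsing sketch for one *_vec.txt line written by compute_vectors above:
# the format is label + '\t' + ','.join(vector values). The sample line is
# fabricated; real lines come from thucnews_*_vec.txt.
import numpy as np

line = "体育\t" + ",".join(["0.1"] * 300)               # stand-in for a real line
label, vecs_str = line.rstrip('\n').split('\t')
vec = np.array(vecs_str.split(','), dtype=np.float32)   # back to a float vector
print(label, vec.shape)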
                              dtype=np.float32).reshape((-1, 300))
            # [1, num_classes=14]
            softmax = model(inputs)
            pred_label = np.argmax(softmax, axis=1)

            fw.write(str(true_label) + '\t' + str(pred_label[0]) + '\t' + '\n')

            line_cnt += 1
            if line_cnt % 1000 == 0:
                print(line_cnt)
        print("Total line %d" % line_cnt)


if __name__ == '__main__':
    num_classes = 14

    checkpoint_dir = os.path.join(get_model_dir(), "fast_text")
    data_dir = os.path.join(get_data_dir(), "text_classification")
    #val_path = os.path.join(data_dir, "thucnews_val_vec.txt")
    test_path = os.path.join(data_dir, "thucnews_test_vec.txt")
    test_result_path = os.path.join(data_dir, "thucnews_test_fast_text.txt")

    # === Build and compile model.
    model = PretrainedFastText(num_classes=num_classes)
    optimizer = tf.keras.optimizers.Adam(0.001)
    loss = tf.keras.losses.SparseCategoricalCrossentropy()
    model.compile(optimizer=optimizer, loss=loss, metrics=['acc'])

    # === Load weights.
    checkpoint = tf.train.latest_checkpoint(checkpoint_dir=checkpoint_dir)
def train_model():
    total_num_train = 669589    # num_lines of thucnews_train_seg.txt
    total_num_val = 83316       # num_lines of thucnews_test_seg.txt
    max_seq_len = 350   # avg #words per sequence ~= 380; ~350 after removing stop-words
    filters = 100
    kernel_size = 5
    dense_units = 128   # 3 * filters ==> dense_units ==> 14
    dropout_keep_ratio = 0.5
    num_classes = 14
    epochs = 100
    #epochs = 3
    shuffle_buffer_size = 1024 * 2
    batch_size = 32
    patience = 10   # for early stopping
    model_name = "pretrained_text_cnn"

    data_dir = os.path.join(get_data_dir(), "text_classification")
    train_path = os.path.join(data_dir, "thucnews_train_seg.txt")
    val_path = os.path.join(data_dir, "thucnews_test_seg.txt")
    log_dir = os.path.join(get_log_dir(), model_name)
    checkpoint_path = os.path.join(get_model_dir(), model_name, "ckpt")
    history_path = os.path.join(get_log_dir(), "history", model_name + ".pkl")
    word_vector_dict_path = os.path.join(get_model_dir(), "sogou_vectors.pkl")
    print('word_vector', word_vector_dict_path)

    # === Load word_vec_dict
    word_vec_dict = load_word_vector_dict(word_vector_dict_path=word_vector_dict_path)
    print("#word_vec_dict = %d" % len(word_vec_dict))

    num_train_batch = total_num_train // batch_size + 1
    num_val_batch = total_num_val // batch_size + 1

    # === tf.data.Dataset
    train_dataset = get_dataset(data_path=train_path,
                                epochs=epochs,
                                shuffle_buffer_size=shuffle_buffer_size,
                                batch_size=batch_size,
                                max_seq_len=max_seq_len,
                                word_vec_dict=word_vec_dict)
    val_dataset = get_dataset(data_path=val_path,
                              epochs=epochs,
                              shuffle_buffer_size=shuffle_buffer_size,
                              batch_size=batch_size,
                              max_seq_len=max_seq_len,
                              word_vec_dict=word_vec_dict)

    # === model
    model = PretrainedTextCNN(num_classes=num_classes,
                              filters=filters,
                              kernel_size=kernel_size,
                              dense_units=dense_units,
                              dropout_keep_ratio=dropout_keep_ratio,
                              max_seq_len=max_seq_len)

    # optimizer
    optimizer = tf.keras.optimizers.Adam(0.001)
    # loss
    loss = tf.keras.losses.SparseCategoricalCrossentropy()
    model.compile(optimizer=optimizer, loss=loss, metrics=['acc'])

    # callbacks
    callbacks = []
    early_stopping_cb = EarlyStopping(monitor='val_loss',
                                      patience=patience,
                                      restore_best_weights=True)
    callbacks.append(early_stopping_cb)
    tensorboard_cb = TensorBoard(log_dir=log_dir)
    callbacks.append(tensorboard_cb)
    checkpoint_cb = ModelCheckpoint(filepath=checkpoint_path,
                                    save_weights_only=True,
                                    save_best_only=True)
    callbacks.append(checkpoint_cb)

    # === Train
    history = model.fit(train_dataset,
                        epochs=epochs,
                        steps_per_epoch=num_train_batch,
                        validation_data=val_dataset,
                        validation_steps=num_val_batch,
                        callbacks=callbacks)
    print(model.summary())

    return history
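As an illustration of what max_seq_len means here, a standalone sketch (not the repository's get_dataset helper) that maps a segmented document to a fixed [max_seq_len, 300] matrix of pretrained vectors, truncating long texts and zero-padding short ones.

# Illustrative only -- not the repo's get_dataset(): turn a list of segmented
# words into a fixed-size [max_seq_len, 300] float matrix using word_vec_dict,
# skipping OOV words, truncating at max_seq_len and zero-padding the rest.
import numpy as np

def doc_to_matrix(words, word_vec_dict, max_seq_len=350, dim=300):
    mat = np.zeros((max_seq_len, dim), dtype=np.float32)
    kept = 0
    for w in words:
        if kept >= max_seq_len:
            break
        vec = word_vec_dict.get(w)
        if vec is not None:     # skip words without a pretrained vector
            mat[kept] = vec
            kept += 1
    return mat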
from jiangziya.utils.config import get_train_data_dir, get_model_dir
import os
from sklearn.datasets import load_svmlight_file
import time, joblib


if __name__ == '__main__':
    test_path = os.path.join(get_train_data_dir(), "test_tfidf.txt")
    test_result_path = os.path.join(get_train_data_dir(), "thucnews_test_sk_libsvm.txt")
    model_path = os.path.join(get_model_dir(), "sk_libsvm.pkl")

    clf = joblib.load(model_path)
    print(clf)
    print("Load model done!")

    start = time.time()
    X, y = load_svmlight_file(test_path)
    end = time.time()
    last = end - start
    print("Load data lasts %.2fs" % last)

    start = time.time()
    y_pred_list = clf.predict(X)
    end = time.time()
    last = end - start
    print("Test lasts %.2fs" % last)

    with open(test_result_path, 'w', encoding='utf-8') as fw:
        line_cnt = 0
        for y_true, y_pred in zip(y, y_pred_list):
            fw.write(str(y_true) + '\t' + str(y_pred) + '\n')
    # header line: 365076 300; num_words, embedding_dim
    # remaining lines: word \t vec_1 \s vec_2 ... \s vec_300
    # Output: {word: vec as numpy.ndarray}
    word_vector_dict = {}
    with open(word_vector_path, 'r', encoding='utf-8') as f:
        header = False
        for line in f:
            if not header:
                header = True
                continue

            buf = line[:-1].strip().split(' ')
            word = buf[0]
            # [300, ]
            vec = np.array(list(map(lambda x: float(x), buf[1:])), dtype=np.float32)
            word_vector_dict[word] = vec
    return word_vector_dict


if __name__ == '__main__':
    word_vector_path = os.path.join(get_model_dir(), "sgns.sogou.char")
    word_vector_dict_path = os.path.join(get_model_dir(), "sogou_vectors.pkl")

    word_vector_dict = get_word_vector_dict(word_vector_path=word_vector_path)

    with open(word_vector_dict_path, 'wb') as fw:
        pickle.dump(word_vector_dict, fw)
    print("Write done! %s" % word_vector_dict_path)
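A minimal usage sketch for the pickle written above; it assumes only that sogou_vectors.pkl exists, and the lookup word is illustrative.

# Minimal usage sketch: load the pickled {word: vector} dict written above
# and look up one word. The query word is illustrative only.
from jiangziya.utils.config import get_model_dir
import os, pickle

if __name__ == '__main__':
    word_vector_dict_path = os.path.join(get_model_dir(), "sogou_vectors.pkl")
    with open(word_vector_dict_path, 'rb') as f:
        word_vec_dict = pickle.load(f)

    vec = word_vec_dict.get("中国")    # np.ndarray of shape (300,), or None if absent
    print(len(word_vec_dict), None if vec is None else vec.shape)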