def test_sk_svm_on_sample_data():
    # data from mllib
    train_data_path = os.path.join(get_data_dir(), "mllib", "sample_svm_train")
    test_data_path = os.path.join(get_data_dir(), "mllib", "sample_svm_test")

    start = time.time()

    #clf = svm.SVC(kernel='linear', C=1)               # acc=0.50, k<x, x'> = x^T x'
    #clf = svm.SVC(kernel='poly', C=1, gamma='scale')  # acc=0.61, k<x, x'> = (\gamma x^T x' + r)^d
    clf = svm.SVC(kernel='rbf', C=1, gamma='scale')    # acc=0.62, k<x, x'> = exp(-gamma ||x - x'||^2)
    #clf = svm.SVC(kernel='sigmoid', C=10)             # acc=0.54, k<x, x'> = tanh(gamma x^T x' + r)
    print(clf)

    #X, y = load_svmlight_file(train_data_path)
    X, y = load_label_data(data_path=train_data_path)
    clf.fit(X, y)

    end = time.time()
    last = end - start
    print("Train lasts %.2fs" % last)

    X, y_true = load_label_data(data_path=test_data_path)
    y_pred = clf.predict(X)
    print(classification_report(y_true, y_pred))
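# The kernels above were compared by editing the script by hand; a small grid
# search automates that comparison. Minimal sketch only: the function name is
# hypothetical and it assumes the same get_data_dir() / load_label_data()
# helpers used in test_sk_svm_on_sample_data(); results depend on the data.
def test_sk_svm_grid_search():
    import os
    from sklearn import svm
    from sklearn.model_selection import GridSearchCV

    train_data_path = os.path.join(get_data_dir(), "mllib", "sample_svm_train")
    X, y = load_label_data(data_path=train_data_path)

    # Search kernel and C jointly with 5-fold cross-validation on accuracy.
    param_grid = {
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'C': [0.1, 1, 10],
    }
    search = GridSearchCV(svm.SVC(gamma='scale'), param_grid,
                          scoring='accuracy', cv=5)
    search.fit(X, y)
    print("best params:", search.best_params_)
    print("best cv accuracy: %.2f" % search.best_score_)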
        .shuffle(buffer_size=shuffle_buffer_size)\
        .batch(batch_size=batch_size)


def get_dataset(data_path=None, epochs=10, shuffle_buffer_size=1024,
                batch_size=16, word2id_dict=None):
    return dataset_generator(data_path=data_path,
                             epochs=epochs,
                             shuffle_buffer_size=shuffle_buffer_size,
                             batch_size=batch_size,
                             word2id_dict=word2id_dict)


if __name__ == "__main__":
    data_dir = os.path.join(get_data_dir(), "text_classification")
    train_path = os.path.join(data_dir, "thucnews_train_vec.txt")

    word2id_dict = None  # TODO: build/load the word-to-id vocabulary
    train_dataset = get_dataset(data_path=train_path, batch_size=4,
                                word2id_dict=word2id_dict)

    # inputs: [None, 300]
    # label: [None, ]
    for i, (inputs, labels) in zip(range(2), train_dataset):
        print(i, inputs.shape, labels.shape)
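# One way to fill in the word2id_dict TODO above: assign consecutive ids over a
# vocabulary file, reserving 0 for padding and 1 for out-of-vocabulary words.
# Sketch only; the vocab file path and one-word-per-line format are assumptions,
# not project code.
def build_word2id_dict(vocab_path):
    word2id = {"<pad>": 0, "<unk>": 1}
    with open(vocab_path, 'r', encoding='utf-8') as fr:
        for line in fr:
            word = line.rstrip('\n')
            if word and word not in word2id:
                word2id[word] = len(word2id)
    return word2id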
def train_pretrained_fast_text():
    total_num_train = 669589  # num_lines of thucnews_train_vec.txt
    total_num_val = 83316     # num_lines of thucnews_val_vec.txt
    num_classes = 14

    epochs = 100
    #epochs = 3
    shuffle_buffer_size = 1024 * 2
    batch_size = 32
    patience = 10  # for early stopping

    data_dir = os.path.join(get_data_dir(), "text_classification")
    train_path = os.path.join(data_dir, "thucnews_train_vec.txt")
    val_path = os.path.join(data_dir, "thucnews_val_vec.txt")
    log_dir = os.path.join(get_log_dir(), "fast_text")
    checkpoint_path = os.path.join(get_model_dir(), "fast_text", "ckpt")
    history_path = os.path.join(get_log_dir(), "history", "fast_text.pkl")

    num_train_batch = total_num_train // batch_size + 1
    num_val_batch = total_num_val // batch_size + 1

    # === tf.data.Dataset
    train_dataset = get_pretrained_dataset(
        data_path=train_path,
        epochs=epochs,
        shuffle_buffer_size=shuffle_buffer_size,
        batch_size=batch_size)
    val_dataset = get_pretrained_dataset(
        data_path=val_path,
        epochs=epochs,
        shuffle_buffer_size=shuffle_buffer_size,
        batch_size=batch_size)

    # === model
    model = PretrainedFastText(num_classes=num_classes)

    # optimizer
    optimizer = tf.keras.optimizers.Adam(0.001)
    # loss
    loss = tf.keras.losses.SparseCategoricalCrossentropy()
    model.compile(optimizer=optimizer, loss=loss, metrics=['acc'])

    # callbacks
    callbacks = []
    early_stopping_cb = EarlyStopping(monitor='val_loss',
                                      patience=patience,
                                      restore_best_weights=True)
    callbacks.append(early_stopping_cb)
    tensorboard_cb = TensorBoard(log_dir=log_dir)
    callbacks.append(tensorboard_cb)
    checkpoint_cb = ModelCheckpoint(filepath=checkpoint_path,
                                    save_weights_only=True,
                                    save_best_only=True)
    callbacks.append(checkpoint_cb)

    # === Train
    history = model.fit(train_dataset,
                        epochs=epochs,
                        steps_per_epoch=num_train_batch,
                        validation_data=val_dataset,
                        validation_steps=num_val_batch,
                        callbacks=callbacks)
    print(model.summary())
    return history
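# history_path is computed in train_pretrained_fast_text() but never written.
# A sketch of persisting the Keras history and restoring the best checkpoint
# later; assumes the same imports as the training script (tf, PretrainedFastText)
# and a validation dataset supplied by the caller. Helper names are hypothetical.
import pickle


def save_history(history, history_path):
    with open(history_path, 'wb') as fw:
        pickle.dump(history.history, fw)  # plain dict of per-epoch metrics


def evaluate_from_checkpoint(checkpoint_path, val_dataset, num_val_batch,
                             num_classes=14):
    model = PretrainedFastText(num_classes=num_classes)
    model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                  metrics=['acc'])
    # A subclassed model needs to be built before its weights can be loaded;
    # one forward pass on a single batch is enough.
    for inputs, _ in val_dataset.take(1):
        model(inputs)
    model.load_weights(checkpoint_path)
    return model.evaluate(val_dataset, steps=num_val_batch)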
            fw.write(label + '\t' + seg_title + '\t' + seg_text + '\n')

            line_cnt += 1
            if line_cnt % 1000 == 0:
                print(line_cnt)

    print(line_cnt)


if __name__ == '__main__':
    #file_type = "thucnews_train"
    #file_type = "thucnews_val"
    file_type = "thucnews_test"

    file_name = "shuf_" + file_type + ".txt"  # use shuffled file to seg
    seg_file_name = file_type + "_seg.txt"

    file_path = os.path.join(get_data_dir(), "text_classification", file_name)
    seg_path = os.path.join(get_data_dir(), "text_classification", seg_file_name)
    #stopwords_path = os.path.join(get_data_dir(), "nlp", "baidu_stopwords.txt")
    stopwords_dict_path = os.path.join(get_data_dir(), "nlp", "stopwords_dict.pkl")

    # === Load stopwords_dict
    stopwords_dict = load_stopwords_dict(
        stopwords_dict_path=stopwords_dict_path)
    print("#stopwords_dict = %d" % len(stopwords_dict))

    # === Segmentation
    segmentation_file(file_path=file_path,
                      seg_path=seg_path,
                      stopwords_dict=stopwords_dict)
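# segmentation_file() lives elsewhere in the project; this is a sketch of the
# per-field work it performs, assuming jieba for word segmentation and the
# label \t title \t text layout written above. Not the project's exact code.
import jieba


def segment_field(text, stopwords_dict):
    # Cut the raw field into words, drop whitespace tokens and stop-words,
    # and join with spaces so the *_seg.txt files stay one record per line.
    words = [w for w in jieba.cut(text)
             if w.strip() and w not in stopwords_dict]
    return ' '.join(words)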
                continue
            vecs /= num_word

            vecs_str = ','.join(map(str, vecs))
            fw.write(label + '\t' + vecs_str + '\n')

            line_cnt += 1
            if line_cnt % 1000 == 0:
                print(line_cnt)

    print("Total line %d" % line_cnt)


if __name__ == '__main__':
    #file_type = "thucnews_train"
    #file_type = "thucnews_val"
    file_type = "thucnews_test"

    text_path = os.path.join(get_data_dir(), "text_classification",
                             file_type + "_seg.txt")
    vec_path = os.path.join(get_data_dir(), "text_classification",
                            file_type + "_vec.txt")
    word_vector_dict_path = os.path.join(get_model_dir(), "sogou_vectors.pkl")

    # === Load word_vec_dict
    word_vec_dict = load_word_vector_dict(
        word_vector_dict_path=word_vector_dict_path)
    print("#word_vec_dict = %d" % len(word_vec_dict))

    # === Compute vectors for file, merge title and text as ONE file.
    compute_vectors(text_path=text_path,
                    vec_path=vec_path,
                    word_vec_dict=word_vec_dict)
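# The head of the averaging loop (summing word vectors before dividing by
# num_word) is not shown above; this sketch reconstructs it for one segmented
# line, under the assumption that words missing from word_vec_dict are simply
# skipped and that the pretrained vectors are 300-dimensional.
import numpy as np


def average_line_vector(words, word_vec_dict, dim=300):
    vecs = np.zeros(dim, dtype=np.float32)
    num_word = 0
    for word in words:
        if word in word_vec_dict:
            vecs += word_vec_dict[word]
            num_word += 1
    if num_word == 0:
        return None  # caller skips the line, as the `continue` above does
    return vecs / num_word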
    stopwords_dict = {}
    for stopwords_path in stopwords_path_list:
        with open(stopwords_path, 'r', encoding='utf-8') as fr:
            for line in fr:
                stopword = line[:-1]
                stopwords_dict[stopword] = True

    # English punctuation
    for char in string.punctuation:
        stopwords_dict[char] = True
    # Chinese punctuation
    for char in hanzi.punctuation:
        stopwords_dict[char] = True

    with open(stopwords_dict_path, 'wb') as fw:
        pickle.dump(stopwords_dict, fw)
    print("Write done! %s" % stopwords_dict_path)


if __name__ == '__main__':
    baidu_stopwords_path = os.path.join(get_data_dir(), "nlp",
                                        "baidu_stopwords.txt")
    stopwords_dict_path = os.path.join(get_data_dir(), "nlp",
                                       "stopwords_dict.pkl")

    generate_stopwords_dict(stopwords_path_list=[baidu_stopwords_path],
                            stopwords_dict_path=stopwords_dict_path)
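# The dict maps token -> True so membership checks during segmentation are O(1)
# hash lookups. The matching reader used by the segmentation script is then a
# one-line unpickle; sketch only, the real load_stopwords_dict may differ.
import pickle


def load_stopwords_dict(stopwords_dict_path=None):
    with open(stopwords_dict_path, 'rb') as fr:
        return pickle.load(fr)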
def train_model():
    total_num_train = 669589  # num_lines of thucnews_train_seg.txt
    total_num_val = 83316     # num_lines of thucnews_test_seg.txt

    max_seq_len = 350  # avg #words per sequence ~380; ~350 after stop-word removal
    filters = 100
    kernel_size = 5
    dense_units = 128  # 3 * filters ==> dense_units ==> 14
    dropout_keep_ratio = 0.5
    num_classes = 14

    epochs = 100
    #epochs = 3
    shuffle_buffer_size = 1024 * 2
    batch_size = 32
    patience = 10  # for early stopping

    model_name = "pretrained_text_cnn"
    data_dir = os.path.join(get_data_dir(), "text_classification")
    train_path = os.path.join(data_dir, "thucnews_train_seg.txt")
    val_path = os.path.join(data_dir, "thucnews_test_seg.txt")
    log_dir = os.path.join(get_log_dir(), model_name)
    checkpoint_path = os.path.join(get_model_dir(), model_name, "ckpt")
    history_path = os.path.join(get_log_dir(), "history", model_name + ".pkl")
    word_vector_dict_path = os.path.join(get_model_dir(), "sogou_vectors.pkl")
    print('word_vector', word_vector_dict_path)

    # === Load word_vec_dict
    word_vec_dict = load_word_vector_dict(
        word_vector_dict_path=word_vector_dict_path)
    print("#word_vec_dict = %d" % len(word_vec_dict))

    num_train_batch = total_num_train // batch_size + 1
    num_val_batch = total_num_val // batch_size + 1

    # === tf.data.Dataset
    train_dataset = get_dataset(data_path=train_path,
                                epochs=epochs,
                                shuffle_buffer_size=shuffle_buffer_size,
                                batch_size=batch_size,
                                max_seq_len=max_seq_len,
                                word_vec_dict=word_vec_dict)
    val_dataset = get_dataset(data_path=val_path,
                              epochs=epochs,
                              shuffle_buffer_size=shuffle_buffer_size,
                              batch_size=batch_size,
                              max_seq_len=max_seq_len,
                              word_vec_dict=word_vec_dict)

    # === model
    model = PretrainedTextCNN(num_classes=num_classes,
                              filters=filters,
                              kernel_size=kernel_size,
                              dense_units=dense_units,
                              dropout_keep_ratio=dropout_keep_ratio,
                              max_seq_len=max_seq_len)

    # optimizer
    optimizer = tf.keras.optimizers.Adam(0.001)
    # loss
    loss = tf.keras.losses.SparseCategoricalCrossentropy()
    model.compile(optimizer=optimizer, loss=loss, metrics=['acc'])

    # callbacks
    callbacks = []
    early_stopping_cb = EarlyStopping(monitor='val_loss',
                                      patience=patience,
                                      restore_best_weights=True)
    callbacks.append(early_stopping_cb)
    tensorboard_cb = TensorBoard(log_dir=log_dir)
    callbacks.append(tensorboard_cb)
    checkpoint_cb = ModelCheckpoint(filepath=checkpoint_path,
                                    save_weights_only=True,
                                    save_best_only=True)
    callbacks.append(checkpoint_cb)

    # === Train
    history = model.fit(train_dataset,
                        epochs=epochs,
                        steps_per_epoch=num_train_batch,
                        validation_data=val_dataset,
                        validation_steps=num_val_batch,
                        callbacks=callbacks)
    print(model.summary())
    return history
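# PretrainedTextCNN is defined elsewhere; the "3 * filters ==> dense_units ==> 14"
# comment above suggests three parallel Conv1D branches over the padded
# [max_seq_len, 300] pretrained-vector inputs. Sketch under that assumption only:
# train_model() passes a single kernel_size=5, so the branch sizes (3, 4, 5)
# here are a guess, and the real model may differ. Note that tf.keras Dropout
# takes the fraction to DROP, so dropout_keep_ratio=0.5 maps to Dropout(1 - 0.5).
import tensorflow as tf


def build_text_cnn(num_classes=14, filters=100, kernel_sizes=(3, 4, 5),
                   dense_units=128, dropout_keep_ratio=0.5,
                   max_seq_len=350, embed_dim=300):
    inputs = tf.keras.Input(shape=(max_seq_len, embed_dim))
    branches = []
    for k in kernel_sizes:
        x = tf.keras.layers.Conv1D(filters, k, activation='relu')(inputs)
        x = tf.keras.layers.GlobalMaxPooling1D()(x)
        branches.append(x)
    x = tf.keras.layers.Concatenate()(branches)        # [batch, 3 * filters]
    x = tf.keras.layers.Dense(dense_units, activation='relu')(x)
    x = tf.keras.layers.Dropout(1.0 - dropout_keep_ratio)(x)
    outputs = tf.keras.layers.Dense(num_classes, activation='softmax')(x)
    return tf.keras.Model(inputs, outputs)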
    with open(text_path, 'r', encoding='utf-8') as f:
        for line in f:
            r = np.random.random()
            if r < train_ratio:
                fw_train.write(line)
            elif r < train_ratio + val_ratio:
                fw_val.write(line)
            else:
                fw_test.write(line)

    fw_train.close()
    fw_val.close()
    fw_test.close()


if __name__ == '__main__':
    text_path = os.path.join(get_data_dir(), "thucnews.txt")
    train_path = os.path.join(get_data_dir(), "thucnews_train.txt")
    val_path = os.path.join(get_data_dir(), "thucnews_val.txt")
    test_path = os.path.join(get_data_dir(), "thucnews_test.txt")

    train_ratio = 0.8
    val_ratio = 0.1
    split_train_val_test(text_path, train_path, val_path, test_path,
                         train_ratio=train_ratio, val_ratio=val_ratio)
    print("Write done!", train_path)
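# The per-line np.random.random() draw only approximates the 80/10/10 split and
# produces a different partition on every run. Seeding NumPy before calling
# split_train_val_test (a suggestion, not part of the original script) makes
# the split reproducible.
import numpy as np

np.random.seed(42)  # any fixed value works; 42 is arbitrary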