def test_forward_backward(self):
    resource_path = os.path.join(os.path.split(__file__)[0], "../../resources")
    glove_path = os.path.join(resource_path, "glove.6B/glove.6B.50d.txt")
    model = TextClassifier(10, glove_path)
    model.summary()
    input_data = np.random.randint(20, size=(4, 500))
    self.assert_forward_backward(model, input_data)
def test_distributed_textset_integration(self):
    texts_rdd = self.sc.parallelize(self.texts)
    labels_rdd = self.sc.parallelize(self.labels)
    distributed_set = DistributedTextSet(texts_rdd, labels_rdd)
    assert distributed_set.is_distributed()
    assert not distributed_set.is_local()
    assert distributed_set.get_texts().collect() == self.texts
    assert distributed_set.get_labels().collect() == self.labels

    sets = distributed_set.random_split([0.5, 0.5])
    train_texts = sets[0].get_texts().collect()
    test_texts = sets[1].get_texts().collect()
    assert set(train_texts + test_texts) == set(self.texts)

    tokenized = Tokenizer()(distributed_set)
    transformed = tokenized.normalize().word2idx().shape_sequence(5).generate_sample()
    word_index = transformed.get_word_index()
    assert len(word_index) == 14
    samples = transformed.get_samples().collect()
    assert len(samples) == 3
    for sample in samples:
        assert sample.feature.shape[0] == 5

    vocab_file = create_tmp_path() + ".txt"
    transformed.save_word_index(vocab_file)

    distributed_set2 = DistributedTextSet(texts_rdd, labels_rdd)
    distributed_set2.load_word_index(vocab_file)
    transformed2 = distributed_set2.tokenize().normalize().word2idx()\
        .shape_sequence(5).generate_sample()
    samples2 = transformed2.get_samples().collect()
    for s1, s2 in zip(samples, samples2):
        assert np.allclose(s1.feature.to_ndarray(), s2.feature.to_ndarray())
    os.remove(vocab_file)

    model = TextClassifier(5, self.glove_path, word_index, 5, encoder="lstm")
    model.compile(SGD(), SparseCategoricalCrossEntropy())
    model.fit(transformed, batch_size=2, nb_epoch=2)
    res_set = model.predict(transformed, batch_per_thread=2)
    predicts = res_set.get_predicts().collect()
    for predict in predicts:
        assert len(predict) == 1
        assert predict[0].shape == (5, )

    tmp_path = create_tmp_path() + ".bigdl"
    model.save_model(tmp_path, over_write=True)
    loaded_model = TextClassifier.load_model(tmp_path)
    loaded_res_set = loaded_model.predict(transformed, batch_per_thread=2)
    loaded_predicts = loaded_res_set.get_predicts().collect()
    assert len(loaded_predicts) == len(predicts)
    os.remove(tmp_path)
def test_forward_backward(self):
    model = TextClassifier(10, glove_path)
    model.summary()
    input_data = np.random.randint(20, size=(4, 500))
    self.assert_forward_backward(model, input_data)
    model.set_evaluate_status()
    # In evaluate mode, forwarding twice should give the same output
    output1 = model.forward(input_data)
    output2 = model.forward(input_data)
    assert np.allclose(output1, output2)
parser.add_option("-m", "--model", dest="model") (options, args) = parser.parse_args(sys.argv) sc = init_nncontext("Text Classification Example") text_set = TextSet.read(path=options.data_path).to_distributed( sc, int(options.partition_num)) print("Processing text dataset...") transformed = text_set.tokenize().normalize()\ .word2idx(remove_topN=10, max_words_num=int(options.max_words_num))\ .shape_sequence(len=int(options.sequence_length)).generate_sample() train_set, val_set = transformed.random_split( [float(options.training_split), 1 - float(options.training_split)]) if options.model: model = TextClassifier.load_model(options.model) else: token_length = int(options.token_length) if not (token_length == 50 or token_length == 100 or token_length == 200 or token_length == 300): raise ValueError( 'token_length for GloVe can only be 50, 100, 200, 300, but got ' + str(token_length)) embedding_file = options.embedding_path + "/glove.6B." + str( token_length) + "d.txt" word_index = transformed.get_word_index() model = TextClassifier(int(options.class_num), embedding_file, word_index, int(options.sequence_length), options.encoder, int(options.encoder_output_dim))
parser.add_option("--port", dest="port", default="9999") parser.add_option("--index_path", dest="index_path") parser.add_option("--partition_num", dest="partition_num", default="4") parser.add_option("--sequence_length", dest="sequence_length", default="500") parser.add_option("-b", "--batch_size", dest="batch_size", default="128") parser.add_option("-m", "--model", dest="model") (options, args) = parser.parse_args(sys.argv) sc = init_nncontext("Text Classification Example") ssc = StreamingContext(sc, 3) lines = ssc.socketTextStream(options.host, int(options.port)) model = TextClassifier.load_model(options.model) def predict(record): if record.getNumPartitions() == 0: return text_set = DistributedTextSet(record) text_set.load_word_index(options.index_path) print("Processing text...") transformed = text_set.tokenize().normalize()\ .word2idx()\ .shape_sequence(len=int(options.sequence_length)).generate_sample() predict_set = model.predict(transformed, int(options.partition_num)) # Get the first five prediction probability distributions predicts = predict_set.get_predicts().take(5) print("Probability distributions of top-5 texts:") for p in predicts:
def test_save_load(self):
    model = TextClassifier(20, 200)
    input_data = np.random.random([2, 500, 200])
    self.assert_zoo_model_save_load(model, input_data)
def test_forward_backward(self):
    model = TextClassifier(10, 30, 100)
    model.summary()
    input_data = np.random.random([3, 100, 30])
    self.assert_forward_backward(model, input_data)
def test_local_textset_integration(self):
    local_set = LocalTextSet(self.texts, self.labels)
    assert local_set.is_local()
    assert not local_set.is_distributed()
    assert local_set.get_texts() == self.texts
    assert local_set.get_labels() == self.labels

    tokenized = ChainedPreprocessing([Tokenizer(), Normalizer()])(local_set)
    word_index = tokenized.generate_word_index_map(max_words_num=10)
    transformed = ChainedPreprocessing([WordIndexer(word_index), SequenceShaper(10),
                                        TextFeatureToSample()])(tokenized)
    assert transformed.is_local()
    word_index = transformed.get_word_index()
    assert len(word_index) == 10
    assert word_index["my"] == 1
    samples = transformed.get_samples()
    assert len(samples) == 3
    for sample in samples:
        assert sample.feature.shape[0] == 10

    model = TextClassifier(5, self.glove_path, word_index, 10)
    model.compile("adagrad", "sparse_categorical_crossentropy", ['accuracy'])
    tmp_log_dir = create_tmp_path()
    tmp_checkpoint_path = create_tmp_path()
    os.mkdir(tmp_checkpoint_path)
    model.set_tensorboard(tmp_log_dir, "textclassification")
    model.set_checkpoint(tmp_checkpoint_path)
    model.fit(transformed, batch_size=2, nb_epoch=2, validation_data=transformed)
    acc = model.evaluate(transformed, batch_size=2)
    res_set = model.predict(transformed, batch_per_thread=2)
    predicts = res_set.get_predicts()

    # Test for loaded model predict on TextSet
    tmp_path = create_tmp_path() + ".bigdl"
    model.save_model(tmp_path, over_write=True)
    loaded_model = TextClassifier.load_model(tmp_path)
    loaded_res_set = loaded_model.predict(transformed, batch_per_thread=2)
    loaded_predicts = loaded_res_set.get_predicts()
    assert len(predicts) == len(loaded_predicts)
    for i in range(0, len(predicts)):
        assert len(predicts[i]) == 1
        assert len(loaded_predicts[i]) == 1
        assert predicts[i][0].shape == (5, )
        assert np.allclose(predicts[i][0], loaded_predicts[i][0])

    shutil.rmtree(tmp_log_dir)
    shutil.rmtree(tmp_checkpoint_path)
    os.remove(tmp_path)
tokens_rdd = text_data_rdd.map(
    lambda text_label: ([w for w in text_to_words(text_label[0])
                         if w in word_mata_broadcast.value], text_label[1]))
padded_tokens_rdd = tokens_rdd.map(
    lambda tokens_label: (pad(tokens_label[0], "##", sequence_len), tokens_label[1]))
vector_rdd = padded_tokens_rdd.map(
    lambda tokens_label: ([to_vec(w, filtered_word2vec_broadcast.value, token_length)
                           for w in tokens_label[0]], tokens_label[1]))
sample_rdd = vector_rdd.map(
    lambda vectors_label: to_sample(vectors_label[0], vectors_label[1], token_length))

train_rdd, val_rdd = sample_rdd.randomSplit([training_split, 1 - training_split])

if options.model:
    model = TextClassifier.load_model(options.model)
else:
    model = TextClassifier(CLASS_NUM, token_length, sequence_len,
                           options.encoder, int(options.encoder_output_dim))

optimizer = Optimizer(
    model=model,
    training_rdd=train_rdd,
    criterion=ClassNLLCriterion(logProbAsInput=False),
    end_trigger=MaxEpoch(int(options.nb_epoch)),
    batch_size=batch_size,
    optim_method=Adagrad(learningrate=float(options.learning_rate),
                         learningrate_decay=0.001))
optimizer.set_validation(
    batch_size=batch_size,
    val_rdd=val_rdd,
    trigger=EveryEpoch(),
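    # The excerpt cuts off inside the set_validation call. The validation metric
    # below and the final optimize() call are assumptions based on the standard
    # BigDL Optimizer API (Top1Accuracy imported from bigdl.optim.optimizer);
    # they are not part of the original excerpt.
    val_method=[Top1Accuracy()])
trained_model = optimizer.optimize()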
word_meta = dict(word_meta[10:max_words_num])
word_mata_broadcast = sc.broadcast(word_meta)
indexed_rdd = text_data_rdd\
    .map(lambda text_label: ([word_mata_broadcast.value[w][0]
                              for w in text_to_words(text_label[0])
                              if w in word_mata_broadcast.value], text_label[1]))\
    .map(lambda tokens_label: (pad(tokens_label[0], 0, sequence_len), tokens_label[1]))
sample_rdd = indexed_rdd.map(
    lambda features_label: Sample.from_ndarray(np.array(features_label[0]),
                                               features_label[1]))
train_rdd, val_rdd = sample_rdd.randomSplit([training_split, 1 - training_split])

if options.model:
    model = TextClassifier.load_model(options.model)
else:
    if token_length not in (50, 100, 200, 300):
        raise ValueError('token_length for GloVe can only be 50, 100, 200, 300, but got '
                         + str(token_length))
    embedding_file = data_path + "/glove.6B/glove.6B." + str(token_length) + "d.txt"
    word_index = {w: i_f[0] for w, i_f in word_meta.items()}
    model = TextClassifier(class_num, embedding_file, word_index, sequence_len,
                           options.encoder, int(options.encoder_output_dim))

optimizer = Optimizer(model=model,
                      training_rdd=train_rdd,
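                      # The excerpt cuts off mid-call; the remaining Optimizer
                      # arguments below are a sketch mirroring the word2vec-based
                      # example above and are not part of this excerpt.
                      criterion=ClassNLLCriterion(logProbAsInput=False),
                      end_trigger=MaxEpoch(int(options.nb_epoch)),
                      batch_size=batch_size,
                      optim_method=Adagrad(learningrate=float(options.learning_rate),
                                           learningrate_decay=0.001))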
def test_save_load(self):
    model = TextClassifier(20, glove_path, sequence_length=100)
    input_data = np.random.randint(20, size=(3, 100))
    self.assert_zoo_model_save_load(model, input_data)
def test_forward_backward(self):
    model = TextClassifier(10, glove_path)
    model.summary()
    input_data = np.random.randint(20, size=(4, 500))
    self.assert_forward_backward(model, input_data)