def test_distributed_textset_integration(self):
    """End-to-end check of DistributedTextSet: construction from RDDs,
    random split, the tokenize/normalize/word2idx/shape_sequence pipeline,
    word-index save/load round-trip, and TextClassifier train/predict
    plus model save/load round-trip.
    """
    texts_rdd = self.sc.parallelize(self.texts)
    labels_rdd = self.sc.parallelize(self.labels)
    distributed_set = DistributedTextSet(texts_rdd, labels_rdd)
    # A distributed set must report as distributed, never local.
    assert distributed_set.is_distributed()
    assert not distributed_set.is_local()
    assert distributed_set.get_texts().collect() == self.texts
    assert distributed_set.get_labels().collect() == self.labels
    sets = distributed_set.random_split([0.5, 0.5])
    train_texts = sets[0].get_texts().collect()
    test_texts = sets[1].get_texts().collect()
    # The split is random, but together the halves must cover every text.
    assert set(train_texts + test_texts) == set(self.texts)
    tokenized = Tokenizer()(distributed_set)
    # Pad/truncate each token sequence to length 5, then build Samples.
    transformed = tokenized.normalize().word2idx().shape_sequence(
        5).generate_sample()
    word_index = transformed.get_word_index()
    # Expected vocabulary size for the fixture texts — presumably fixed by
    # self.texts; verify against the fixture if it changes.
    assert len(word_index) == 14
    samples = transformed.get_samples().collect()
    assert len(samples) == 3
    for sample in samples:
        # Each feature was shaped to the sequence length of 5 above.
        assert sample.feature.shape[0] == 5
    vocab_file = create_tmp_path() + ".txt"
    transformed.save_word_index(vocab_file)
    # Rebuild a fresh set with the SAVED word index: the resulting samples
    # must be identical to the first pipeline's output.
    distributed_set2 = DistributedTextSet(texts_rdd, labels_rdd)
    distributed_set2.load_word_index(vocab_file)
    transformed2 = distributed_set2.tokenize().normalize().word2idx()\
        .shape_sequence(5).generate_sample()
    samples2 = transformed2.get_samples().collect()
    for s1, s2 in zip(samples, samples2):
        assert np.allclose(s1.feature.to_ndarray(), s2.feature.to_ndarray())
    os.remove(vocab_file)
    model = TextClassifier(5, self.glove_path, word_index, 5, encoder="lstm")
    model.compile(SGD(), SparseCategoricalCrossEntropy())
    model.fit(transformed, batch_size=2, nb_epoch=2)
    res_set = model.predict(transformed, batch_per_thread=2)
    predicts = res_set.get_predicts().collect()
    for predict in predicts:
        # One prediction per text, with 5 class scores (class_num above).
        assert len(predict) == 1
        assert predict[0].shape == (5, )
    tmp_path = create_tmp_path() + ".bigdl"
    model.save_model(tmp_path, over_write=True)
    # A reloaded model must produce the same number of predictions.
    loaded_model = TextClassifier.load_model(tmp_path)
    loaded_res_set = loaded_model.predict(transformed, batch_per_thread=2)
    loaded_predicts = loaded_res_set.get_predicts().collect()
    assert len(loaded_predicts) == len(predicts)
    os.remove(tmp_path)
def bigdl_estimator():
    """Train and evaluate a small Transformer sentiment classifier on IMDB
    using the Orca BigDL Estimator, exercising the gradient-clipping,
    summary, and save APIs along the way.

    Side effects: initializes (and finally stops) a local Orca context,
    downloads the IMDB dataset, and writes a saved model to
    'work/saved_model'. Returns None.
    """
    from zoo.orca.learn.bigdl.estimator import Estimator
    from tensorflow.python.keras.datasets import imdb
    from tensorflow.python.keras.preprocessing import sequence
    from zoo.pipeline.api.keras.models import Model
    from zoo.pipeline.api.keras.objectives import SparseCategoricalCrossEntropy
    from zoo.orca.data import XShards
    from zoo.orca.learn.metrics import Accuracy
    import numpy as np

    init_orca_context(cluster_mode="local", cores=4, memory="16g")
    max_features = 200   # vocabulary size kept by imdb.load_data
    max_len = 20         # every review padded/truncated to this many tokens
    print("running bigdl estimator")

    (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
    # Keep only small slices of the data so the example runs quickly.
    x_train = x_train[:1000]
    y_train = y_train[:1000]
    x_test = x_test[-1000:]
    y_test = y_test[-1000:]
    print(len(x_train), 'train sequences')
    print(len(x_test), 'test sequences')

    print('Pad sequences (samples x time)')
    x_train = sequence.pad_sequences(x_train, maxlen=max_len)
    x_test = sequence.pad_sequences(x_test, maxlen=max_len)
    print('x_train shape:', x_train.shape)
    print('x_test shape:', x_test.shape)

    # Position indices 0..max_len-1 for every sample. Broadcasting fills each
    # array in one vectorized assignment; the original Python loop ran over
    # range(len(x_train)) while indexing BOTH arrays, which would raise
    # IndexError whenever the test split is smaller than the train split.
    train_pos = np.zeros((len(x_train), max_len), dtype=np.int32)
    val_pos = np.zeros((len(x_test), max_len), dtype=np.int32)
    train_pos[:] = np.arange(max_len)
    val_pos[:] = np.arange(max_len)

    train_dataset = XShards.partition({"x": (x_train, train_pos), "y": np.array(y_train)})
    val_dataset = XShards.partition({"x": (x_test, val_pos), "y": np.array(y_test)})

    # Two inputs: token ids and their positions, both of length max_len.
    token_shape = (max_len,)
    position_shape = (max_len,)
    token_input = Input(shape=token_shape)
    position_input = Input(shape=position_shape)
    O_seq = TransformerLayer.init(
        vocab=max_features, hidden_size=128, n_head=8,
        seq_len=max_len)([token_input, position_input])
    # Select the first output of the Transformer. The second is the pooled output.
    O_seq = SelectTable(0)(O_seq)
    O_seq = GlobalAveragePooling1D()(O_seq)
    O_seq = Dropout(0.2)(O_seq)
    outputs = Dense(2, activation='softmax')(O_seq)
    model = Model([token_input, position_input], outputs)
    model.summary()

    batch_size = 64
    print("Train started")
    est = Estimator.from_bigdl(model=model, loss=SparseCategoricalCrossEntropy(),
                               optimizer=Adam(), metrics=[Accuracy()])
    # First epoch trains with constant gradient clipping...
    est.set_constant_gradient_clipping(0.1, 0.2)
    est.fit(data=train_dataset, batch_size=batch_size, epochs=1)
    result = est.evaluate(val_dataset)
    print(result)
    # ...then switch to L2-norm clipping for a second epoch.
    est.clear_gradient_clipping()
    est.set_l2_norm_gradient_clipping(0.5)
    est.fit(data=train_dataset, batch_size=batch_size, epochs=1)
    print("Train finished")

    print("Evaluating started")
    result = est.evaluate(val_dataset)
    print(result)
    print("Evaluating finished")

    est.save('work/saved_model')
    print("load and save API finished")

    est.get_train_summary(tag='Loss')
    est.get_validation_summary(tag='Top1Accuracy')
    print("get summary API finished")

    stop_orca_context()
# Convert each (word-vector sequence, label) pair into a BigDL Sample of the
# configured token length, then split into train/validation RDDs.
sample_rdd = vector_rdd.map(lambda vectors_label: to_sample(
    vectors_label[0], vectors_label[1], token_length))
train_rdd, val_rdd = sample_rdd.randomSplit(
    [training_split, 1 - training_split])

# Either resume from a previously saved model or build a fresh classifier.
if options.model:
    model = TextClassifier.load_model(options.model)
else:
    model = TextClassifier(CLASS_NUM, token_length, sequence_len,
                           options.encoder, int(options.encoder_output_dim))

# Distributed training driver: Adagrad with a fixed per-iteration decay,
# stopping after the requested number of epochs.
optimizer = Optimizer(model=model,
                      training_rdd=train_rdd,
                      criterion=SparseCategoricalCrossEntropy(),
                      end_trigger=MaxEpoch(int(options.nb_epoch)),
                      batch_size=batch_size,
                      optim_method=Adagrad(learningrate=float(
                          options.learning_rate), learningrate_decay=0.001))
# Validate accuracy on the held-out split at the end of every epoch.
optimizer.set_validation(batch_size=batch_size,
                         val_rdd=val_rdd,
                         trigger=EveryEpoch(),
                         val_method=[Accuracy()])

# Tensorboard-style summaries, written under a timestamped app name so
# repeated runs do not overwrite each other.
log_dir = options.log_dir
app_name = 'adam-' + dt.datetime.now().strftime("%Y%m%d-%H%M%S")
train_summary = TrainSummary(log_dir=log_dir, app_name=app_name)
# Parameter summaries are expensive; only record them every 50 iterations.
train_summary.set_summary_trigger("Parameters", SeveralIteration(50))
val_summary = ValidationSummary(log_dir=log_dir, app_name=app_name)
# Build the two-input Transformer classifier: token ids plus position ids.
token_input = Input(shape=token_shape)
position_input = Input(shape=position_shape)
O_seq = TransformerLayer.init(vocab=max_features, hidden_size=128, n_head=8,
                              seq_len=max_len)([token_input, position_input])
# Select the first output of the Transformer. The second is the pooled output.
O_seq = SelectTable(0)(O_seq)
# Average over the sequence dimension, then classify into 2 classes.
O_seq = GlobalAveragePooling1D()(O_seq)
O_seq = Dropout(0.2)(O_seq)
outputs = Dense(2, activation='softmax')(O_seq)
model = Model([token_input, position_input], outputs)
model.summary()

batch_size = 128
print('Train...')
# Wrap the Keras-style model in an Orca Estimator for distributed training.
est = Estimator.from_bigdl(model=model,
                           loss=SparseCategoricalCrossEntropy(),
                           optimizer=Adam(),
                           metrics=[Accuracy()])
est.fit(data=train_dataset,
        batch_size=batch_size,
        epochs=1)
print("Train finished.")

print('Evaluating...')
result = est.evaluate(val_dataset)
print(result)
print("finished...")
# Tear down the Orca/Spark context started earlier in the script.
stop_orca_context()