Example #1
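Unit test: builds a TextClassifier with 10 output classes on top of the pre-trained GloVe 50d embedding file from the test resources directory and checks a forward/backward pass on a batch of random word-index sequences of shape (4, 500).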
 def test_forward_backward(self):
     resource_path = os.path.join(
         os.path.split(__file__)[0], "../../resources")
     glove_path = os.path.join(resource_path, "glove.6B/glove.6B.50d.txt")
     model = TextClassifier(10, glove_path)
     model.summary()
     input_data = np.random.randint(20, size=(4, 500))
     self.assert_forward_backward(model, input_data)
Example #2
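Integration test for a DistributedTextSet: builds the set from text and label RDDs, random-splits it, runs the tokenize/normalize/word2idx/shape_sequence/generate_sample pipeline, saves and reloads the word index, trains a TextClassifier with SGD and sparse categorical cross-entropy, predicts on the transformed set, and finally saves and reloads the model.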
    def test_distributed_textset_integration(self):
        texts_rdd = self.sc.parallelize(self.texts)
        labels_rdd = self.sc.parallelize(self.labels)
        distributed_set = DistributedTextSet(texts_rdd, labels_rdd)
        assert distributed_set.is_distributed()
        assert not distributed_set.is_local()
        assert distributed_set.get_texts().collect() == self.texts
        assert distributed_set.get_labels().collect() == self.labels

        sets = distributed_set.random_split([0.5, 0.5])
        train_texts = sets[0].get_texts().collect()
        test_texts = sets[1].get_texts().collect()
        assert set(train_texts + test_texts) == set(self.texts)

        tokenized = Tokenizer()(distributed_set)
        transformed = tokenized.normalize().word2idx().shape_sequence(
            5).generate_sample()
        word_index = transformed.get_word_index()
        assert len(word_index) == 14
        samples = transformed.get_samples().collect()
        assert len(samples) == 3
        for sample in samples:
            assert sample.feature.shape[0] == 5

        vocab_file = create_tmp_path() + ".txt"
        transformed.save_word_index(vocab_file)
        distributed_set2 = DistributedTextSet(texts_rdd, labels_rdd)
        distributed_set2.load_word_index(vocab_file)
        transformed2 = distributed_set2.tokenize().normalize().word2idx()\
            .shape_sequence(5).generate_sample()
        samples2 = transformed2.get_samples().collect()
        for s1, s2 in zip(samples, samples2):
            assert np.allclose(s1.feature.to_ndarray(),
                               s2.feature.to_ndarray())
        os.remove(vocab_file)

        model = TextClassifier(5,
                               self.glove_path,
                               word_index,
                               5,
                               encoder="lstm")
        model.compile(SGD(), SparseCategoricalCrossEntropy())
        model.fit(transformed, batch_size=2, nb_epoch=2)
        res_set = model.predict(transformed, batch_per_thread=2)
        predicts = res_set.get_predicts().collect()
        for predict in predicts:
            assert len(predict) == 1
            assert predict[0].shape == (5, )

        tmp_path = create_tmp_path() + ".bigdl"
        model.save_model(tmp_path, over_write=True)
        loaded_model = TextClassifier.load_model(tmp_path)
        loaded_res_set = loaded_model.predict(transformed, batch_per_thread=2)
        loaded_predicts = loaded_res_set.get_predicts().collect()
        assert len(loaded_predicts) == len(predicts)
        os.remove(tmp_path)
Example #3
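Forward/backward test that additionally switches the model to evaluate status and verifies that two consecutive forward passes on the same input produce identical output.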
 def test_forward_backward(self):
     model = TextClassifier(10, glove_path)
     model.summary()
     input_data = np.random.randint(20, size=(4, 500))
     self.assert_forward_backward(model, input_data)
     model.set_evaluate_status()
     # Forward twice will get the same output
     output1 = model.forward(input_data)
     output2 = model.forward(input_data)
     assert np.allclose(output1, output2)
Example #4
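Training script excerpt (the option definitions at the top are truncated): reads a TextSet from disk, distributes it over Spark, runs the text preprocessing pipeline, splits it into training and validation sets, and either loads an existing model or constructs a TextClassifier from the GloVe embedding file matching the requested token length.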
    parser.add_option("-m", "--model", dest="model")

    (options, args) = parser.parse_args(sys.argv)
    sc = init_nncontext("Text Classification Example")

    text_set = TextSet.read(path=options.data_path).to_distributed(
        sc, int(options.partition_num))
    print("Processing text dataset...")
    transformed = text_set.tokenize().normalize()\
        .word2idx(remove_topN=10, max_words_num=int(options.max_words_num))\
        .shape_sequence(len=int(options.sequence_length)).generate_sample()
    train_set, val_set = transformed.random_split(
        [float(options.training_split), 1 - float(options.training_split)])

    if options.model:
        model = TextClassifier.load_model(options.model)
    else:
        token_length = int(options.token_length)
        if token_length not in (50, 100, 200, 300):
            raise ValueError(
                'token_length for GloVe can only be 50, 100, 200, 300, but got '
                + str(token_length))
        embedding_file = options.embedding_path + "/glove.6B." + str(
            token_length) + "d.txt"
        word_index = transformed.get_word_index()
        model = TextClassifier(int(options.class_num),
                               embedding_file, word_index,
                               int(options.sequence_length), options.encoder,
                               int(options.encoder_output_dim))
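Example #5
Streaming prediction script excerpt: parses command-line options, loads a saved TextClassifier, and for each batch of a Spark Streaming socket text stream builds a DistributedTextSet, reloads the saved word index, preprocesses the text, and prints the prediction probability distributions of the first five texts.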
    parser.add_option("--port", dest="port", default="9999")
    parser.add_option("--index_path", dest="index_path")
    parser.add_option("--partition_num", dest="partition_num", default="4")
    parser.add_option("--sequence_length",
                      dest="sequence_length",
                      default="500")
    parser.add_option("-b", "--batch_size", dest="batch_size", default="128")
    parser.add_option("-m", "--model", dest="model")

    (options, args) = parser.parse_args(sys.argv)

    sc = init_nncontext("Text Classification Example")
    ssc = StreamingContext(sc, 3)
    lines = ssc.socketTextStream(options.host, int(options.port))

    model = TextClassifier.load_model(options.model)

    def predict(record):
        if record.getNumPartitions() == 0:
            return
        text_set = DistributedTextSet(record)
        text_set.load_word_index(options.index_path)
        print("Processing text...")
        transformed = text_set.tokenize().normalize()\
            .word2idx()\
            .shape_sequence(len=int(options.sequence_length)).generate_sample()
        predict_set = model.predict(transformed, int(options.partition_num))
        # Get the first five prediction probability distributions
        predicts = predict_set.get_predicts().take(5)
        print("Probability distributions of top-5 texts:")
        for p in predicts:
            print(p)  # loop body truncated in the original listing; printing each prediction is an assumed completion
Example #6
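Save/load test for a TextClassifier variant that appears to take pre-embedded input (20 classes, token length 200); the random float input has shape (2, 500, 200).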
 def test_save_load(self):
     model = TextClassifier(20, 200)
     input_data = np.random.random([2, 500, 200])
     self.assert_zoo_model_save_load(model, input_data)
Example #7
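Forward/backward test for the pre-embedded-input variant: 10 classes, token length 30, sequence length 100, with random float input of shape (3, 100, 30).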
 def test_forward_backward(self):
     model = TextClassifier(10, 30, 100)
     model.summary()
     input_data = np.random.random([3, 100, 30])
     self.assert_forward_backward(model, input_data)
Example #8
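Integration test for a LocalTextSet: applies the Tokenizer/Normalizer/WordIndexer/SequenceShaper/TextFeatureToSample preprocessing chain, trains a TextClassifier with TensorBoard logging and checkpointing enabled, evaluates and predicts, and then verifies that a saved and reloaded model produces the same predictions.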
    def test_local_textset_integration(self):
        local_set = LocalTextSet(self.texts, self.labels)
        assert local_set.is_local()
        assert not local_set.is_distributed()
        assert local_set.get_texts() == self.texts
        assert local_set.get_labels() == self.labels
        tokenized = ChainedPreprocessing([Tokenizer(),
                                          Normalizer()])(local_set)
        word_index = tokenized.generate_word_index_map(max_words_num=10)
        transformed = ChainedPreprocessing([
            WordIndexer(word_index),
            SequenceShaper(10),
            TextFeatureToSample()
        ])(tokenized)
        assert transformed.is_local()
        word_index = transformed.get_word_index()
        assert len(word_index) == 10
        assert word_index["my"] == 1
        samples = transformed.get_samples()
        assert len(samples) == 3
        for sample in samples:
            assert sample.feature.shape[0] == 10

        model = TextClassifier(5, self.glove_path, word_index, 10)
        model.compile("adagrad", "sparse_categorical_crossentropy",
                      ['accuracy'])
        tmp_log_dir = create_tmp_path()
        tmp_checkpoint_path = create_tmp_path()
        os.mkdir(tmp_checkpoint_path)
        model.set_tensorboard(tmp_log_dir, "textclassification")
        model.set_checkpoint(tmp_checkpoint_path)
        model.fit(transformed,
                  batch_size=2,
                  nb_epoch=2,
                  validation_data=transformed)
        acc = model.evaluate(transformed, batch_size=2)
        res_set = model.predict(transformed, batch_per_thread=2)
        predicts = res_set.get_predicts()

        # Test for loaded model predict on TextSet
        tmp_path = create_tmp_path() + ".bigdl"
        model.save_model(tmp_path, over_write=True)
        loaded_model = TextClassifier.load_model(tmp_path)
        loaded_res_set = loaded_model.predict(transformed, batch_per_thread=2)
        loaded_predicts = loaded_res_set.get_predicts()
        assert len(predicts) == len(loaded_predicts)

        for i in range(0, len(predicts)):
            assert len(predicts[i]) == 1
            assert len(loaded_predicts[i]) == 1
            assert predicts[i][0].shape == (5, )
            assert np.allclose(predicts[i][0], loaded_predicts[i][0])
        shutil.rmtree(tmp_log_dir)
        shutil.rmtree(tmp_checkpoint_path)
        os.remove(tmp_path)
Example #9
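Training pipeline excerpt using raw RDD transformations: tokenizes and pads each text, maps words to word2vec vectors, converts the result to Sample RDDs, splits into training and validation sets, and sets up a BigDL Optimizer with ClassNLLCriterion and Adagrad; the snippet is truncated inside the set_validation call.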
    tokens_rdd = text_data_rdd.map(lambda text_label:
                                   ([w for w in text_to_words(text_label[0]) if
                                     w in word_mata_broadcast.value], text_label[1]))
    padded_tokens_rdd = tokens_rdd.map(lambda tokens_label:
                                       (pad(tokens_label[0], "##", sequence_len), tokens_label[1]))
    vector_rdd = padded_tokens_rdd.map(lambda tokens_label:
                                       ([to_vec(w, filtered_word2vec_broadcast.value, token_length)
                                         for w in tokens_label[0]], tokens_label[1]))
    sample_rdd = vector_rdd.map(
        lambda vectors_label: to_sample(vectors_label[0], vectors_label[1], token_length))

    train_rdd, val_rdd = sample_rdd.randomSplit([training_split, 1-training_split])

    if options.model:
        model = TextClassifier.load_model(options.model)
    else:
        model = TextClassifier(CLASS_NUM, token_length, sequence_len,
                               options.encoder, int(options.encoder_output_dim))

    optimizer = Optimizer(
        model=model,
        training_rdd=train_rdd,
        criterion=ClassNLLCriterion(logProbAsInput=False),
        end_trigger=MaxEpoch(int(options.nb_epoch)),
        batch_size=batch_size,
        optim_method=Adagrad(learningrate=float(options.learning_rate), learningrate_decay=0.001))
    optimizer.set_validation(
        batch_size=batch_size,
        val_rdd=val_rdd,
        trigger=EveryEpoch(),
Example #10
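A differently formatted excerpt of the same RDD-based pipeline: padding, word2vec lookup, Sample conversion, train/validation split, and Optimizer setup with ClassNLLCriterion and Adagrad; the snippet starts mid-expression and is likewise truncated at the set_validation call.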
        if w in word_mata_broadcast.value
    ], text_label[1]))
    padded_tokens_rdd = tokens_rdd.map(lambda tokens_label: (pad(
        tokens_label[0], "##", sequence_len), tokens_label[1]))
    vector_rdd = padded_tokens_rdd.map(lambda tokens_label: ([
        to_vec(w, filtered_word2vec_broadcast.value, token_length)
        for w in tokens_label[0]
    ], tokens_label[1]))
    sample_rdd = vector_rdd.map(lambda vectors_label: to_sample(
        vectors_label[0], vectors_label[1], token_length))

    train_rdd, val_rdd = sample_rdd.randomSplit(
        [training_split, 1 - training_split])

    if options.model:
        model = TextClassifier.load_model(options.model)
    else:
        model = TextClassifier(CLASS_NUM, token_length,
                               sequence_len, options.encoder,
                               int(options.encoder_output_dim))

    optimizer = Optimizer(model=model,
                          training_rdd=train_rdd,
                          criterion=ClassNLLCriterion(logProbAsInput=False),
                          end_trigger=MaxEpoch(int(options.nb_epoch)),
                          batch_size=batch_size,
                          optim_method=Adagrad(learningrate=float(
                              options.learning_rate),
                                               learningrate_decay=0.001))
    optimizer.set_validation(batch_size=batch_size,
                             val_rdd=val_rdd,
Example #11
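Training excerpt that indexes words against a broadcast word-meta dictionary, pads the index sequences, converts them to Samples, splits the data, and either loads an existing model or builds a TextClassifier from the GloVe embedding file whose dimensionality matches the requested token length.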
    word_meta = dict(word_meta[10:max_words_num])
    word_mata_broadcast = sc.broadcast(word_meta)

    indexed_rdd = text_data_rdd\
        .map(lambda text_label:
             ([word_mata_broadcast.value[w][0] for w in text_to_words(text_label[0]) if
               w in word_mata_broadcast.value], text_label[1]))\
        .map(lambda tokens_label:
             (pad(tokens_label[0], 0, sequence_len), tokens_label[1]))
    sample_rdd = indexed_rdd.map(lambda features_label: Sample.from_ndarray(
        np.array(features_label[0]), features_label[1]))
    train_rdd, val_rdd = sample_rdd.randomSplit(
        [training_split, 1 - training_split])

    if options.model:
        model = TextClassifier.load_model(options.model)
    else:
        if token_length not in (50, 100, 200, 300):
            raise ValueError(
                'token_length for GloVe can only be 50, 100, 200, 300, but got '
                + str(token_length))
        embedding_file = data_path + "/glove.6B/glove.6B." + str(
            token_length) + "d.txt"
        word_index = {w: i_f[0] for w, i_f in word_meta.items()}
        model = TextClassifier(class_num, embedding_file, word_index,
                               sequence_len, options.encoder,
                               int(options.encoder_output_dim))

    optimizer = Optimizer(model=model,
                          training_rdd=train_rdd,
Example #12
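Save/load test for a GloVe-based TextClassifier with 20 classes, sequence length 100, and random integer word-index input of shape (3, 100).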
 def test_save_load(self):
     model = TextClassifier(20, glove_path, sequence_length=100)
     input_data = np.random.randint(20, size=(3, 100))
     self.assert_zoo_model_save_load(model, input_data)
Example #13
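Forward/backward test for a GloVe-based TextClassifier with 10 classes on a batch of random word-index sequences of shape (4, 500); the glove_path is assumed to be defined elsewhere in the test module.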
 def test_forward_backward(self):
     model = TextClassifier(10, glove_path)
     model.summary()
     input_data = np.random.randint(20, size=(4, 500))
     self.assert_forward_backward(model, input_data)