Example #1
def test_tensorflow_predictor_no_training():
    model = build_model()
    checkpoint = to_air_checkpoint(model)
    batch_predictor = BatchPredictor.from_checkpoint(
        checkpoint, TensorflowPredictor, model_definition=build_model)
    predict_dataset = ray.data.range(3)
    predictions = batch_predictor.predict(predict_dataset)
    assert predictions.count() == 3
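
`build_model` is a module-level helper not shown in this snippet; Example #11 below defines the one-layer Keras model builder that these TensorFlow examples share.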
Example #2
def test_sklearn_predictor_no_training():
    with tempfile.TemporaryDirectory() as tmpdir:
        checkpoint = to_air_checkpoint(path=tmpdir, estimator=model)
        batch_predictor = BatchPredictor.from_checkpoint(
            checkpoint, SklearnPredictor)
        test_dataset = ray.data.from_pandas(
            pd.DataFrame(dummy_data, columns=["A", "B"]))
        predictions = batch_predictor.predict(test_dataset)
        assert len(predictions.to_pandas()) == 3
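
`model` and `dummy_data` are module-level fixtures from the original test file and are not shown. A minimal sketch of what they might look like (the estimator choice and values are assumptions), sized so the final `len(...) == 3` assertion holds:

import numpy as np
from sklearn.ensemble import RandomForestClassifier

# Hypothetical fixtures: three rows of features and a fitted estimator.
dummy_data = np.array([[1, 2], [3, 4], [5, 6]])
dummy_target = np.array([0, 1, 0])
model = RandomForestClassifier(n_estimators=10).fit(dummy_data, dummy_target)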
Example #3
def predict_linear(result: Result):
    batch_predictor = BatchPredictor.from_checkpoint(result.checkpoint,
                                                     TorchPredictor)

    items = [{"x": random.uniform(0, 1)} for _ in range(10)]
    prediction_dataset = ray.data.from_items(items)

    predictions = batch_predictor.predict(prediction_dataset,
                                          dtype=torch.float)

    return predictions
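
`result` here is the `Result` object returned by a completed Torch training run; `result.checkpoint` carries the trained weights that `TorchPredictor` restores.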
Example #4
def test_batch_prediction():
    batch_predictor = BatchPredictor.from_checkpoint(
        Checkpoint.from_dict({"factor": 2.0}), DummyPredictor)

    test_dataset = ray.data.from_items([1.0, 2.0, 3.0, 4.0])
    predictions = batch_predictor.predict(test_dataset)
    assert (predictions.to_pandas().to_numpy().squeeze().tolist()
            == [4.0, 8.0, 12.0, 16.0])
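
`DummyPredictor` is defined elsewhere in the original test suite and is not shown. A minimal sketch of the pattern, with import paths assumed, and an extra 2x scaling assumed so that `factor=2.0` yields the 4x outputs asserted above:

import pandas as pd
from ray.air.checkpoint import Checkpoint
from ray.air.predictor import Predictor


class DummyPredictor(Predictor):
    """Hypothetical reconstruction: scales each input batch by a factor."""

    def __init__(self, factor: float = 1.0):
        self.factor = factor

    @classmethod
    def from_checkpoint(cls, checkpoint: Checkpoint, **kwargs) -> "DummyPredictor":
        # Restore state from the in-memory checkpoint dict ({"factor": 2.0}).
        return cls(**checkpoint.to_dict())

    def predict(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
        # The assertion above implies an overall 4x for factor=2.0; the extra
        # doubling here is an assumption to make the numbers match (the
        # original suite may instead apply a preprocessor that doubles inputs).
        return data * 2 * self.factor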
Example #5
def test_batch_prediction_fs():
    batch_predictor = BatchPredictor.from_checkpoint(
        Checkpoint.from_dict({"factor": 2.0}), DummyPredictorFS)

    test_dataset = ray.data.from_items([1.0, 2.0, 3.0, 4.0] *
                                       32).repartition(8)
    predictions = batch_predictor.predict(test_dataset, min_scoring_workers=4)
    assert (predictions.to_pandas().to_numpy().squeeze().tolist()
            == [4.0, 8.0, 12.0, 16.0] * 32)
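
`DummyPredictorFS` likewise is not shown; its name suggests a variant of `DummyPredictor` that restores from a filesystem (directory) checkpoint. A sketch under that assumption:

class DummyPredictorFS(DummyPredictor):
    """Hypothetical variant that round-trips its state through disk."""

    @classmethod
    def from_checkpoint(cls, checkpoint: Checkpoint, **kwargs) -> "DummyPredictorFS":
        # Materialize the checkpoint on disk, as a filesystem-backed
        # predictor would (the FS suffix is the basis for this assumption).
        checkpoint.to_directory()
        return cls(**checkpoint.to_dict())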
Example #6
def predict_linear(result: Result) -> Dataset:
    batch_predictor = BatchPredictor.from_checkpoint(
        result.checkpoint, TensorflowPredictor, model_definition=build_model)

    items = [{"x": np.random.uniform(0, 1)} for _ in range(10)]
    prediction_dataset = ray.data.from_items(items)

    predictions = batch_predictor.predict(prediction_dataset, dtype=tf.float32)

    pandas_predictions = predictions.to_pandas(float("inf"))

    print(f"PREDICTIONS\n{pandas_predictions}")

    return predictions
Example #7
def predict_tensorflow_mnist(result: Result) -> ray.data.Dataset:
    test_dataset = get_dataset(split_type="test")
    batch_predictor = BatchPredictor.from_checkpoint(
        result.checkpoint,
        TensorflowPredictor,
        model_definition=build_autoencoder_model)

    predictions = batch_predictor.predict(test_dataset,
                                          feature_columns=["image"],
                                          dtype=tf.float32)

    pandas_predictions = predictions.to_pandas(float("inf"))
    print(f"PREDICTIONS\n{pandas_predictions}")

    return pandas_predictions
Example #8
def test_batch_prediction_with_set_cpus(ray_start_4_cpus):
    with tempfile.TemporaryDirectory() as tmpdir:
        with open(os.path.join(tmpdir, MODEL_KEY), "wb") as f:
            cpickle.dump(model, f)

        checkpoint = Checkpoint.from_directory(tmpdir)

        batch_predictor = BatchPredictor.from_checkpoint(
            checkpoint, SklearnPredictor)

        test_dataset = ray.data.from_pandas(
            pd.DataFrame(dummy_data, columns=["A", "B"]))
        batch_predictor.predict(test_dataset,
                                num_cpus_per_worker=2,
                                num_estimator_cpus=2)
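
As in Example #2, `model` and `dummy_data` are unshown fixtures; `cpickle` and `MODEL_KEY` also come from module scope. A sketch of plausible definitions (the constant's value is an assumption):

import ray.cloudpickle as cpickle  # Ray's bundled cloudpickle

# Assumed: the file name SklearnPredictor expects inside the checkpoint.
MODEL_KEY = "model"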
Example #9
def test_e2e(ray_start_4_cpus, save_strategy):
    ray_train = ray.data.from_pandas(train_df)
    ray_validation = ray.data.from_pandas(validation_df)
    scaling_config = {"num_workers": 2, "use_gpu": False}
    trainer = HuggingFaceTrainer(
        trainer_init_per_worker=train_function,
        trainer_init_config={"epochs": 4, "save_strategy": save_strategy},
        scaling_config=scaling_config,
        datasets={"train": ray_train, "evaluation": ray_validation},
    )
    result = trainer.fit()

    assert result.metrics["epoch"] == 4
    assert result.metrics["training_iteration"] == 4
    assert result.checkpoint

    trainer2 = HuggingFaceTrainer(
        trainer_init_per_worker=train_function,
        trainer_init_config={"epochs": 5},  # this will train for 1 epoch: 5 - 4 = 1
        scaling_config=scaling_config,
        datasets={"train": ray_train, "evaluation": ray_validation},
        resume_from_checkpoint=result.checkpoint,
    )
    result2 = trainer2.fit()

    assert result2.metrics["epoch"] == 5
    assert result2.metrics["training_iteration"] == 1
    assert result2.checkpoint

    predictor = BatchPredictor.from_checkpoint(
        result2.checkpoint,
        HuggingFacePredictor,
        task="text-generation",
        tokenizer=AutoTokenizer.from_pretrained(tokenizer_checkpoint),
    )

    predictions = predictor.predict(ray.data.from_pandas(prompts))
    assert predictions.count() == 3
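
`train_function`, `train_df`, `validation_df`, `tokenizer_checkpoint`, and `prompts` are fixtures from the surrounding test module; the final assertion implies `prompts` holds three rows.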
Example #10
def visualize_tensorflow_mnist_autoencoder(result: Result) -> None:
    test_dataset = get_dataset(split_type="test")
    batch_predictor = BatchPredictor.from_checkpoint(
        result.checkpoint,
        TensorflowPredictor,
        model_definition=build_autoencoder_model)

    predictions = batch_predictor.predict(test_dataset,
                                          feature_columns=["image"],
                                          dtype=tf.float32)

    pandas_predictions = predictions.to_pandas(float("inf"))

    decoded_imgs = pandas_predictions["predictions"].values
    x_test = test_dataset.to_pandas(float("inf"))["image"].values

    import matplotlib.pyplot as plt

    n = 10  # How many digits we will display
    plt.figure(figsize=(20, 4))
    for i in range(n):
        # Display original
        ax = plt.subplot(2, n, i + 1)
        plt.imshow(np.asarray(x_test[i]).reshape(28, 28))
        plt.gray()
        ax.get_xaxis().set_visible(False)
        ax.get_yaxis().set_visible(False)

        # Display reconstruction
        ax = plt.subplot(2, n, i + 1 + n)
        plt.imshow(np.asarray(decoded_imgs[i]).reshape(28, 28))
        plt.gray()
        ax.get_xaxis().set_visible(False)
        ax.get_yaxis().set_visible(False)

    # Save the side-by-side comparison figure to disk.
    plt.savefig("test.png")
Example #11
# flake8: noqa

# __use_pretrained_model_start__
import ray
import tensorflow as tf
from ray.air.batch_predictor import BatchPredictor
from ray.air.predictors.integrations.tensorflow import (
    to_air_checkpoint,
    TensorflowPredictor,
)


# To simulate having a pretrained model.
def build_model() -> tf.keras.Model:
    model = tf.keras.Sequential([
        tf.keras.layers.InputLayer(input_shape=(1, )),
        tf.keras.layers.Dense(1),
    ])
    return model


model = build_model()
checkpoint = to_air_checkpoint(model)
batch_predictor = BatchPredictor.from_checkpoint(checkpoint,
                                                 TensorflowPredictor,
                                                 model_definition=build_model)
predict_dataset = ray.data.range(3)
predictions = batch_predictor.predict(predict_dataset)

# __use_pretrained_model_end__
Example #12
def main(
    model_checkpoint="gpt2",
    tokenizer_checkpoint="sgugger/gpt2-like-tokenizer",
    dataset_name="wikitext-2-raw-v1",
    dataset_path="wikitext",
    num_epochs=5,
    num_workers=2,
    use_gpu=False,
    smoke_test=False,
):
    block_size = 128

    # Uncomment the following line if the maximum sequence length the model
    # was pretrained with fits in your memory:
    # block_size = tokenizer.model_max_length

    # Run this as a remote function to avoid downloading on the driver
    @ray.remote
    def get_dataset():
        datasets = load_dataset(dataset_path, dataset_name)
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

        def tokenize_function(examples):
            return tokenizer(examples["text"])

        tokenized_datasets = datasets.map(tokenize_function,
                                          batched=True,
                                          num_proc=1,
                                          remove_columns=["text"])

        def group_texts(examples):
            # Concatenate all texts.
            concatenated_examples = {
                k: sum(examples[k], [])
                for k in examples.keys()
            }
            total_length = len(concatenated_examples[list(examples.keys())[0]])
            # We drop the small remainder. We could add padding if the model supported
            # it instead of this drop. You can customize this part to your needs.
            total_length = (total_length // block_size) * block_size
            # Split by chunks of max_len.
            result = {
                k: [
                    t[i:i + block_size]
                    for i in range(0, total_length, block_size)
                ]
                for k, t in concatenated_examples.items()
            }
            result["labels"] = result["input_ids"].copy()
            return result

        lm_datasets = tokenized_datasets.map(
            group_texts,
            batched=True,
            batch_size=1000,
            num_proc=1,
        )
        ray_train = ray.data.from_huggingface(lm_datasets["train"])
        ray_validation = ray.data.from_huggingface(lm_datasets["validation"])
        return ray_train, ray_validation

    ray_train, ray_validation = ray.get(get_dataset.remote())

    def train_function(train_dataset, eval_dataset=None, **config):
        model_config = AutoConfig.from_pretrained(model_checkpoint)
        model = AutoModelForCausalLM.from_config(model_config)
        print("Initializing TrainingArguments...")
        # The checkpoints will be moved to Ray Tune results
        # directory automatically
        training_dir = tempfile.mkdtemp()
        training_args = TrainingArguments(
            training_dir,
            evaluation_strategy="epoch",
            num_train_epochs=num_epochs,
            learning_rate=2e-5,
            weight_decay=0.01,
            disable_tqdm=True,
            save_strategy="epoch",
            # Required to avoid an exception
            no_cuda=not torch.cuda.is_available(),
        )
        print("Initializing Trainer...")
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
        )
        print("Trainer initialized! Starting training...")
        return trainer

    if smoke_test:
        ray_train = ray_train.limit(16)
        ray_validation = ray_validation.limit(8)

    trainer = HuggingFaceTrainer(
        trainer_init_per_worker=train_function,
        scaling_config={
            "num_workers": num_workers,
            "use_gpu": use_gpu
        },
        datasets={
            "train": ray_train,
            "evaluation": ray_validation
        },
    )
    results = trainer.fit()
    print(results.metrics)

    tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)
    prompt = ["My text: Complete me..."]
    predictor = BatchPredictor.from_checkpoint(
        results.checkpoint,
        HuggingFacePredictor,
        task="text-generation",
        tokenizer=tokenizer,
    )
    data = ray.data.from_pandas(pd.DataFrame(prompt, columns=["prompt"]))
    prediction = predictor.predict(data, num_gpus_per_worker=int(use_gpu))
    prediction = prediction.to_pandas().iloc[0]["generated_text"]

    print(f"Generated text for prompt '{prompt}': '{prediction}'")
Example #13
    params=params,
    datasets={
        "train": train_dataset,
        "valid": valid_dataset
    },
    preprocessor=preprocessor,
    num_boost_round=20,
)
result = trainer.fit()
print(result.metrics)
# __air_xgb_train_end__

# __air_xgb_batchpred_start__
from ray.air.batch_predictor import BatchPredictor
from ray.air.predictors.integrations.xgboost import XGBoostPredictor

batch_predictor = BatchPredictor.from_checkpoint(result.checkpoint,
                                                 XGBoostPredictor)

predicted_labels = (
    batch_predictor.predict(test_dataset)
    .map_batches(lambda df: (df > 0.5).astype(int), batch_format="pandas")
    .to_pandas(limit=float("inf"))
)
print("PREDICTED LABELS")
print(f"{predicted_labels}")

shap_values = batch_predictor.predict(
    test_dataset, pred_contribs=True).to_pandas(limit=float("inf"))
print("SHAP VALUES")
print(f"{shap_values}")
# __air_xgb_batchpred_end__
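
Passing `pred_contribs=True` is forwarded to the underlying XGBoost booster, which then returns per-feature SHAP contribution values plus a bias column rather than plain predictions; that is why the second `predict` call yields one column per feature.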
Example #14
trainer = TensorflowTrainer(
    train_loop_per_worker=train_func,
    train_loop_config=config,
    scaling_config=dict(num_workers=num_workers, use_gpu=use_gpu),
    datasets={"train": dataset},
)
result = trainer.fit()
print(result.metrics)
# __air_tf_train_end__

# __air_tf_batchpred_start__
import numpy as np

from ray.air.batch_predictor import BatchPredictor
from ray.air.predictors.integrations.tensorflow import TensorflowPredictor

batch_predictor = BatchPredictor.from_checkpoint(result.checkpoint,
                                                 TensorflowPredictor,
                                                 model_definition=build_model)

items = [{"x": np.random.uniform(0, 1)} for _ in range(10)]
prediction_dataset = ray.data.from_items(items)

predictions = batch_predictor.predict(prediction_dataset, dtype=tf.float32)

pandas_predictions = predictions.to_pandas(float("inf"))

print(f"PREDICTIONS\n{pandas_predictions}")
# __air_tf_batchpred_end__
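
As in the other TensorFlow examples, `build_model` (and here also `train_func` and `config`) are defined earlier in the original file; `TensorflowPredictor` rebuilds the Keras model by calling `model_definition` and loads the checkpointed weights into it before scoring.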