Example #1
    @mock_s3  # assumed: moto's S3 mock must be active for the virtual bucket below
    def test_save_and_load_to_s3(self):
        # Mocked AWS credentials for moto.
        os.environ["AWS_ACCESS_KEY_ID"] = "fake_access_key"
        os.environ["AWS_SECRET_ACCESS_KEY"] = "fake_secret_key"
        os.environ["AWS_SECURITY_TOKEN"] = "fake_security_token"
        os.environ["AWS_SESSION_TOKEN"] = "fake_session_token"

        s3 = boto3.client("s3", region_name="us-east-1")
        mock_bucket = "moto-mock-s3-bucket"
        # We need to create the bucket since this is all in Moto's 'virtual' AWS account
        s3.create_bucket(Bucket=mock_bucket)
        dataset_path = f"s3://{mock_bucket}/datasets/dict"

        fs = S3FileSystem(key="fake_access_key", secret="fake_secret")

        dsets = self._create_dummy_dataset_dict()
        dsets.save_to_disk(dataset_path, fs)

        del dsets

        dsets = load_from_disk(dataset_path, fs)

        self.assertListEqual(sorted(dsets), ["test", "train"])
        self.assertEqual(len(dsets["train"]), 30)
        self.assertListEqual(dsets["train"].column_names, ["filename"])
        self.assertEqual(len(dsets["test"]), 30)
        self.assertListEqual(dsets["test"].column_names, ["filename"])
        del dsets
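The `_create_dummy_dataset_dict` helper is defined elsewhere in the test class. A minimal sketch consistent with the assertions above (two 30-row splits, each with a single `filename` column) could look like this; the exact implementation is an assumption:

    def _create_dummy_dataset_dict(self):
        # Hypothetical helper: builds a DatasetDict whose "train" and "test"
        # splits each hold 30 rows with a single "filename" column, matching
        # the assertions in test_save_and_load_to_s3.
        from datasets import Dataset, DatasetDict

        data = {"filename": [f"file_{i}.txt" for i in range(30)]}
        return DatasetDict({
            "train": Dataset.from_dict(data),
            "test": Dataset.from_dict(data),
        })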
Example #2
def test_is_remote_filesystem():

    fs = S3FileSystem(key="fake_access_key", secret="fake_secret")

    is_remote = is_remote_filesystem(fs)
    assert is_remote is True

    fs = fsspec.filesystem("file")

    is_remote = is_remote_filesystem(fs)
    assert is_remote is False
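For reference, both tests above assume imports along the following lines; the `S3FileSystem` and `is_remote_filesystem` paths match older `datasets` releases that still shipped the `datasets.filesystems` module:

import os

import boto3
import fsspec
from moto import mock_s3

from datasets import load_from_disk
from datasets.filesystems import S3FileSystem, is_remote_filesystem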
Example #3
def test_distilbert_base(docker_image, processor, instance_type,
                         sagemaker_local_session, py_version):
    from datasets import load_dataset
    from transformers import AutoTokenizer

    # tokenizer used in preprocessing
    tokenizer_name = 'distilbert-base-uncased'

    # dataset used
    dataset_name = 'imdb'

    # s3 key prefix for the data
    s3_prefix = 'samples/datasets/imdb'

    # download tokenizer
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    # tokenizer helper function
    def tokenize(batch):
        return tokenizer(batch['text'], padding='max_length', truncation=True)

    # load the train and test splits
    train_dataset, test_dataset = load_dataset(dataset_name, split=['train', 'test'])
    # shrink the test split to 100 shuffled examples to keep the test fast
    test_dataset = test_dataset.shuffle().select(range(100))

    # tokenize dataset
    train_dataset = train_dataset.map(tokenize,
                                      batched=True,
                                      batch_size=len(train_dataset))
    test_dataset = test_dataset.map(tokenize,
                                    batched=True,
                                    batch_size=len(test_dataset))

    # rename the label column and set the format for pytorch
    train_dataset = train_dataset.rename_column("label", "labels")
    train_dataset.set_format('torch',
                             columns=['input_ids', 'attention_mask', 'labels'])
    test_dataset = test_dataset.rename_column("label", "labels")
    test_dataset.set_format('torch',
                            columns=['input_ids', 'attention_mask', 'labels'])

    # hyperparameters, which are passed into the training job
    hyperparameters = {
        'max_steps': 5,
        'train_batch_size': 4,
        'model_name': 'distilbert-base-uncased'
    }

    s3 = S3FileSystem()

    # save train_dataset to s3
    training_input_path = f's3://{sagemaker_local_session.default_bucket()}/{s3_prefix}/train'
    train_dataset.save_to_disk(training_input_path, fs=s3)

    # save test_dataset to s3
    test_input_path = f's3://{sagemaker_local_session.default_bucket()}/{s3_prefix}/test'
    test_dataset.save_to_disk(test_input_path, fs=s3)

    estimator = HuggingFace(entry_point=distilbert_script,
                            instance_type='local_gpu',
                            sagemaker_session=sagemaker_local_session,
                            image_uri=docker_image,
                            instance_count=1,
                            role=ROLE,
                            py_version=py_version,
                            hyperparameters=hyperparameters)

    estimator.fit({
        'train': training_input_path,
        'test': test_input_path
    })
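`distilbert_script` is a fixture pointing at the training entry point used by the estimator; it is not shown in this example. A minimal sketch of such a script, following the usual SageMaker Hugging Face pattern (file name, argument names, and structure are assumptions):

# train.py -- hypothetical entry-point sketch
import argparse
import os

from datasets import load_from_disk
from transformers import (AutoModelForSequenceClassification, Trainer,
                          TrainingArguments)

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # SageMaker forwards the estimator's hyperparameters as CLI arguments
    parser.add_argument('--max_steps', type=int, default=5)
    parser.add_argument('--train_batch_size', type=int, default=4)
    parser.add_argument('--model_name', type=str)
    args, _ = parser.parse_known_args()

    # SageMaker mounts the 'train' and 'test' channels at these paths
    train_dataset = load_from_disk(os.environ['SM_CHANNEL_TRAIN'])
    test_dataset = load_from_disk(os.environ['SM_CHANNEL_TEST'])

    model = AutoModelForSequenceClassification.from_pretrained(args.model_name)

    training_args = TrainingArguments(
        output_dir=os.environ['SM_MODEL_DIR'],
        max_steps=args.max_steps,
        per_device_train_batch_size=args.train_batch_size,
    )
    trainer = Trainer(model=model,
                      args=training_args,
                      train_dataset=train_dataset,
                      eval_dataset=test_dataset)
    trainer.train()
    trainer.save_model(os.environ['SM_MODEL_DIR'])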