Пример #1
0
    def fs(self):
        """Return this runner's filesystem object, building it on first use.

        The result is a ``CompositeFilesystem`` cached in ``self._fs``.
        Sub-filesystems are registered in this order: ``s3`` (only when
        ``boto3_installed``), ``gcs`` (only when ``google_libs_installed``),
        ``hadoop``, and ``local``.
        """
        # Spark supports basically every filesystem there is
        if self._fs:
            return self._fs

        composite = self._fs = CompositeFilesystem()

        if boto3_installed:
            # the S3 filesystem's keyword names match our option names
            s3_opt_names = (
                'aws_access_key_id',
                'aws_secret_access_key',
                'aws_session_token',
                's3_endpoint',
                's3_region',
            )
            s3_kwargs = {name: self._opts[name] for name in s3_opt_names}

            composite.add_fs(
                's3',
                S3Filesystem(**s3_kwargs),
                disable_if=_is_permanent_boto3_error)

        if google_libs_installed:
            gcs_fs = GCSFilesystem(
                project_id=self._opts['project_id'],
                location=self._opts['gcs_region'],
                object_ttl_days=_DEFAULT_CLOUD_TMP_DIR_OBJECT_TTL_DAYS,
            )
            composite.add_fs(
                'gcs', gcs_fs, disable_if=_is_permanent_google_error)

        # Hadoop FS is responsible for all URIs that fall through to it
        hadoop_fs = HadoopFilesystem(self._opts['hadoop_bin'])
        composite.add_fs('hadoop', hadoop_fs)

        composite.add_fs('local', LocalFilesystem())

        return self._fs
Пример #2
0
    def test_empty_fs(self):
        # a composite with no sub-filesystems claims no paths at all,
        # and trying to use it raises IOError
        empty = CompositeFilesystem()

        for path in ('s3://walrus/fish', '/'):
            self.assertFalse(empty.can_handle_path(path))

        with self.assertRaises(IOError):
            empty.ls('/')
Пример #3
0
    def fs(self):
        """Return this runner's filesystem object, building it on first use.

        The result is a ``CompositeFilesystem`` cached in ``self._fs``, with
        ``s3`` (only when ``boto3_installed``), ``gcs`` (only when
        ``google_libs_installed``), ``hadoop`` and ``local`` registered in
        that order.
        """
        # Spark supports basically every filesystem there is
        if self._fs:
            return self._fs

        opts = self._opts
        composite = self._fs = CompositeFilesystem()

        if boto3_installed:
            s3_fs = S3Filesystem(
                aws_access_key_id=opts['aws_access_key_id'],
                aws_secret_access_key=opts['aws_secret_access_key'],
                aws_session_token=opts['aws_session_token'],
                s3_endpoint=opts['s3_endpoint'],
                s3_region=opts['s3_region'],
            )
            composite.add_fs(
                's3', s3_fs, disable_if=_is_permanent_boto3_error)

        if google_libs_installed:
            gcs_fs = GCSFilesystem(project_id=opts['google_project_id'])
            composite.add_fs(
                'gcs', gcs_fs, disable_if=_is_permanent_google_error)

        composite.add_fs('hadoop', HadoopFilesystem(opts['hadoop_bin']))
        composite.add_fs('local', LocalFilesystem())

        return self._fs
Пример #4
0
 def fs(self):
     """Return the runner's filesystem, creating it on first access.

     The result is a :py:class:`mrjob.fs.base.Filesystem` object that
     combines HDFS (using the ``hadoop_bin`` option) with the local
     filesystem. It is cached in ``self._fs``.
     """
     if self._fs is not None:
         return self._fs

     hadoop_fs = HadoopFilesystem(self._opts['hadoop_bin'])
     local_fs = LocalFilesystem()
     self._fs = CompositeFilesystem(hadoop_fs, local_fs)

     return self._fs
Пример #5
0
    def test_forward_put_with_part_size(self):
        # part_size_mb passed by keyword to the composite should arrive at
        # the S3 filesystem's put() as the third positional argument
        composite = CompositeFilesystem()
        composite.add_fs('s3', self.s3_fs)

        src, dest = '/path/to/file', 's3://walrus/file'
        composite.put(src, dest, part_size_mb=99999)

        self.s3_fs.put.assert_called_once_with(src, dest, 99999)
Пример #6
0
 def fs(self):
     """Return the :py:class:`~mrjob.fs.base.Filesystem` object for the
     local filesystem, creating and caching it on first access.
     """
     if self._fs is not None:
         return self._fs

     # LocalFilesystem is wrapped in a CompositeFilesystem so that
     # URIs result in an IOError (see #1185)
     local_fs = LocalFilesystem()
     self._fs = CompositeFilesystem(local_fs)

     return self._fs
Пример #7
0
    def test_forward_put(self):
        # put() is a special case: the path used to pick the filesystem
        # is the second argument (the destination), not the first
        composite = CompositeFilesystem()
        composite.add_fs('s3', self.s3_fs)

        src, dest = '/path/to/file', 's3://walrus/file'
        composite.put(src, dest)

        self.s3_fs.put.assert_called_once_with(src, dest)
Пример #8
0
    def test_forward_join(self):
        # join() is a special case: it takes more than one path argument
        composite = CompositeFilesystem()
        composite.add_fs('s3', self.s3_fs)

        base, child = 's3://walrus/fish', 'salmon'
        joined = composite.join(base, child)

        self.assertEqual(joined, self.s3_fs.join.return_value)
        self.s3_fs.join.assert_called_once_with(base, child)
Пример #9
0
 def fs(self):
     """Return the :py:class:`~mrjob.fs.base.Filesystem` object for the
     local filesystem, creating and caching it on first access.

     Methods on :py:class:`~mrjob.fs.base.Filesystem` objects will be
     forwarded to :py:class:`~mrjob.runner.MRJobRunner` until mrjob
     0.6.0, but **this behavior is deprecated.**
     """
     if self._fs is not None:
         return self._fs

     # LocalFilesystem is wrapped in a CompositeFilesystem so that
     # URIs result in an IOError (see #1185)
     local_fs = LocalFilesystem()
     self._fs = CompositeFilesystem(local_fs)

     return self._fs
Пример #10
0
    def fs(self):
        """Return the :py:class:`~mrjob.fs.base.Filesystem` object for GCS
        and the local filesystem.

        Built on first access and cached in ``self._fs``; the GCS
        filesystem is additionally kept in ``self._gcs_fs``.
        """
        if self._fs is None:
            gcs_fs = GCSFilesystem()
            self._gcs_fs = gcs_fs

            self._fs = CompositeFilesystem(gcs_fs, LocalFilesystem())

        return self._fs
Пример #11
0
    def test_forward_fs_extensions(self):
        # filesystem-specific methods should be reachable as attributes
        # of the composite itself
        composite = CompositeFilesystem()
        for name, sub_fs in (('s3', self.s3_fs), ('hadoop', self.hadoop_fs)):
            composite.add_fs(name, sub_fs)

        self.assertEqual(composite.create_bucket, self.s3_fs.create_bucket)
        self.assertEqual(
            composite.get_hadoop_version,
            self.hadoop_fs.get_hadoop_version)

        # the "client" attribute is expected not to be forwarded
        self.assertRaises(AttributeError, getattr, composite, 'client')
Пример #12
0
    def fs(self):
        """Return the :py:class:`mrjob.fs.base.Filesystem` object for HDFS
        and the local filesystem, creating and caching it on first access.
        """
        if self._fs is not None:
            return self._fs

        composite = self._fs = CompositeFilesystem()

        # never hand [] to the hadoop filesystem; None means "don't use
        # hadoop until fs.set_hadoop_bin() is called" (used for running
        # hadoop over SSH).
        hadoop_bin = self._opts['hadoop_bin']
        if not hadoop_bin:
            hadoop_bin = None

        composite.add_fs('hadoop', HadoopFilesystem(hadoop_bin))
        composite.add_fs('local', LocalFilesystem())

        return self._fs
Пример #13
0
    def fs(self):
        """Return the :py:class:`~mrjob.fs.base.Filesystem` object for GCS
        and the local filesystem.

        Built on first access and cached in ``self._fs``; the GCS
        filesystem is additionally kept in ``self._gcs_fs``.
        """
        if self._fs is None:
            gcs_kwargs = dict(
                credentials=self._credentials,
                local_tmp_dir=self._get_local_tmp_dir(),
                project_id=self._project_id,
            )
            self._gcs_fs = GCSFilesystem(**gcs_kwargs)

            local_fs = LocalFilesystem()
            self._fs = CompositeFilesystem(self._gcs_fs, local_fs)

        return self._fs
Пример #14
0
    def test_pick_fs(self):
        composite = CompositeFilesystem()
        for name, sub_fs in (('s3', self.s3_fs), ('hadoop', self.hadoop_fs)):
            composite.add_fs(name, sub_fs)

        s3_uri = 's3://walrus/fish'
        hdfs_uri = 'hdfs:///user/hadoop/'

        s3_listing = composite.ls(s3_uri)
        self.assertEqual(s3_listing, self.s3_fs.ls.return_value)

        # hadoop fs was able to handle the URI too, but s3_fs was
        # added first and so got it
        self.assertTrue(self.hadoop_fs.can_handle_path(s3_uri))
        self.assertFalse(self.hadoop_fs.ls.called)

        hdfs_listing = composite.ls(hdfs_uri)
        self.assertEqual(hdfs_listing, self.hadoop_fs.ls.return_value)

        # an error from the chosen FS is raised as-is; we do not fall
        # through to the next FS (unlike the old CompositeFilesystem
        # implementation)
        self.s3_fs.ls.side_effect = IOError

        with self.assertRaises(IOError):
            composite.ls(s3_uri)
Пример #15
0
    def fs(self):
        """Return the :py:class:`~mrjob.fs.base.Filesystem` object for GCS
        and the local filesystem, creating and caching it on first access.

        The GCS location is the ``region`` option if set; otherwise it is
        derived from the ``zone`` option.
        """
        if self._fs is not None:
            return self._fs

        composite = self._fs = CompositeFilesystem()

        # prefer an explicit region; fall back to the zone's region
        location = self._opts['region']
        if not location:
            location = _zone_to_region(self._opts['zone'])

        gcs_fs = GCSFilesystem(
            credentials=self._credentials,
            project_id=self._project_id,
            part_size=self._upload_part_size(),
            location=location,
            object_ttl_days=_DEFAULT_CLOUD_TMP_DIR_OBJECT_TTL_DAYS,
        )
        composite.add_fs('gcs', gcs_fs)

        composite.add_fs('local', LocalFilesystem())

        return self._fs
Пример #16
0
    def test_disable_fs(self):
        class NoCredentialsError(Exception):
            pass

        def _is_no_credentials_error(ex):
            return isinstance(ex, NoCredentialsError)

        composite = CompositeFilesystem()

        # tentatively use S3 filesystem, if set up
        composite.add_fs(
            's3', self.s3_fs, disable_if=_is_no_credentials_error)
        composite.add_fs('hadoop', self.hadoop_fs)

        self.s3_fs.ls.side_effect = NoCredentialsError

        # ls() on the S3 fs raises, which disables it, so the hadoop fs
        # ends up answering
        listing = composite.ls('s3://walrus/')
        self.assertEqual(listing, self.hadoop_fs.ls.return_value)
        self.assertTrue(self.s3_fs.ls.called)

        self.assertIn('s3', composite._disabled)

        # once the s3 fs is disabled, it isn't even tried
        contents = composite.cat('s3://walrus/fish')
        self.assertEqual(contents, self.hadoop_fs.cat.return_value)
        self.assertFalse(self.s3_fs.cat.called)