def setUp(self):
    """Build a fresh mock hadoop configuration holding the basic HA properties.

    The shared mock connector is reset first; the new configuration is then
    stored on ``self._hadoop_configuration`` and filled with the default
    filesystem, the namenode list and one rpc address per namenode.
    """
    # Reset counters in mock connector
    self.mock.reset()

    conf = MockHadoopConfiguration()
    self._hadoop_configuration = conf

    nameservice = HC.WARP_TURTLE
    namenodes_key = 'dfs.ha.namenodes.{}'.format(nameservice)
    nn1_address_key = 'dfs.namenode.rpc-address.{}.nn1'.format(nameservice)
    nn2_address_key = 'dfs.namenode.rpc-address.{}.nn2'.format(nameservice)

    conf.set('fs.defaultFS', HC.FS_WARP_TURTLE)
    conf.set(namenodes_key, 'nn2,nn1')
    conf.set(nn1_address_key, HC.WARP_TURTLE_NN1)
    conf.set(nn2_address_key, HC.WARP_TURTLE_NN2)
class FilesystemResolverTest(unittest.TestCase):
    """
    Checks the full filesystem resolution functionality, exercising each URL interpretation case.
    """

    @classmethod
    def setUpClass(cls):
        """Creates the mock HDFS connector shared by every test of this class."""
        cls.mock = MockHdfsConnector()

    def setUp(self):
        """Initializes a mock hadoop config and populates it with basic properties."""
        # Reset counters in mock connector
        self.mock.reset()
        self._hadoop_configuration = MockHadoopConfiguration()
        self._hadoop_configuration.set('fs.defaultFS', HC.FS_WARP_TURTLE)
        self._hadoop_configuration.set(
            'dfs.ha.namenodes.{}'.format(HC.WARP_TURTLE), 'nn2,nn1')
        self._hadoop_configuration.set(
            'dfs.namenode.rpc-address.{}.nn1'.format(HC.WARP_TURTLE), HC.WARP_TURTLE_NN1)
        self._hadoop_configuration.set(
            'dfs.namenode.rpc-address.{}.nn2'.format(HC.WARP_TURTLE), HC.WARP_TURTLE_NN2)

    def _assert_connect_attempts(self, nn1=0, nn2=0, default=0):
        """Asserts the values the mock connector's ``connect_attempted`` returns per namenode.

        :param nn1: expected value for ``HC.WARP_TURTLE_NN1``
        :param nn2: expected value for ``HC.WARP_TURTLE_NN2``
        :param default: expected value for ``HC.DEFAULT_NN``
        """
        # Checked in nn2, nn1, default order, matching the order the namenodes
        # are listed under 'dfs.ha.namenodes' in setUp.
        self.assertEqual(nn2, self.mock.connect_attempted(HC.WARP_TURTLE_NN2))
        self.assertEqual(nn1, self.mock.connect_attempted(HC.WARP_TURTLE_NN1))
        self.assertEqual(default, self.mock.connect_attempted(HC.DEFAULT_NN))

    def test_error_url_cases(self):
        """Various error cases that result in exception raised."""
        # Case 1: Schemeless path asserts
        with self.assertRaises(ValueError):
            CarbonFilesystemResolver(ABS_PATH, {})

        # Case 4b: HDFS default path case with NO defaultFS
        with self.assertRaises(RuntimeError):
            CarbonFilesystemResolver('hdfs:///some/path', {})

        # Case 4b: Using `default` as host, while apparently a pyarrow convention, is NOT valid
        with self.assertRaises(ArrowIOError):
            CarbonFilesystemResolver('hdfs://default', {})

        # Case 5: other schemes result in ValueError; urlparse to cover an else branch!
        for unsupported_url in ('http://foo/bar', 'ftp://foo/bar', 'ssh://foo/bar'):
            with self.assertRaises(ValueError):
                CarbonFilesystemResolver(urlparse(unsupported_url), {})

        # s3 paths must have the bucket as the netloc
        with self.assertRaises(ValueError):
            CarbonFilesystemResolver(urlparse('s3:///foo/bar'), {})

    def test_file_url(self):
        """ Case 2: File path, agnostic to content of hadoop configuration."""
        suj = CarbonFilesystemResolver(
            'file://{}'.format(ABS_PATH),
            hadoop_configuration=self._hadoop_configuration,
            connector=self.mock)
        self.assertIsInstance(suj.filesystem(), LocalFileSystem)
        self.assertEqual('', suj.parsed_dataset_url().netloc)
        self.assertEqual(ABS_PATH, suj.get_dataset_path())

    def test_hdfs_url_with_nameservice(self):
        """ Case 3a: HDFS nameservice."""
        suj = CarbonFilesystemResolver(
            dataset_url=HC.WARP_TURTLE_PATH,
            hadoop_configuration=self._hadoop_configuration,
            connector=self.mock)
        self.assertEqual(MockHdfs, type(suj.filesystem()._hdfs))
        self.assertEqual(HC.WARP_TURTLE, suj.parsed_dataset_url().netloc)
        self._assert_connect_attempts(nn2=1)

    def test_hdfs_url_no_nameservice(self):
        """ Case 3b: HDFS with no nameservice should connect to default namenode."""
        suj = CarbonFilesystemResolver(
            dataset_url='hdfs:///some/path',
            hadoop_configuration=self._hadoop_configuration,
            connector=self.mock)
        self.assertEqual(MockHdfs, type(suj.filesystem()._hdfs))
        self.assertEqual(HC.WARP_TURTLE, suj.parsed_dataset_url().netloc)
        # ensure path is preserved in parsed URL
        self.assertEqual('/some/path', suj.get_dataset_path())
        self._assert_connect_attempts(nn2=1)

    def test_hdfs_url_direct_namenode(self):
        """ Case 4: direct namenode."""
        suj = CarbonFilesystemResolver(
            'hdfs://{}/path'.format(HC.WARP_TURTLE_NN1),
            hadoop_configuration=self._hadoop_configuration,
            connector=self.mock)
        self.assertEqual(MockHdfs, type(suj.filesystem()))
        self.assertEqual(HC.WARP_TURTLE_NN1, suj.parsed_dataset_url().netloc)
        self._assert_connect_attempts(nn1=1)

    def test_hdfs_url_direct_namenode_retries(self):
        """ Case 4: direct namenode fails first two times thru, but 2nd retry succeeds."""
        self.mock.set_fail_n_next_connect(2)
        namenode_url = 'hdfs://{}/path'.format(HC.WARP_TURTLE_NN2)

        # The first two constructions must raise; each one adds exactly one
        # attempt against NN2 and none against the other namenodes.
        for expected_attempts in (1, 2):
            with self.assertRaises(ArrowIOError):
                CarbonFilesystemResolver(
                    namenode_url,
                    hadoop_configuration=self._hadoop_configuration,
                    connector=self.mock)
            self._assert_connect_attempts(nn2=expected_attempts)

        # this one should connect "successfully"
        suj = CarbonFilesystemResolver(
            namenode_url,
            hadoop_configuration=self._hadoop_configuration,
            connector=self.mock)
        self.assertEqual(MockHdfs, type(suj.filesystem()))
        self.assertEqual(HC.WARP_TURTLE_NN2, suj.parsed_dataset_url().netloc)
        self._assert_connect_attempts(nn2=3)

    def test_s3_without_s3fs(self):
        """An s3a URL raises ValueError when the ``s3fs`` module cannot be imported."""
        with mock.patch.dict('sys.modules', s3fs=None):
            # `import s3fs` will fail in this context
            with self.assertRaises(ValueError):
                CarbonFilesystemResolver(urlparse('s3a://foo/bar'), {})

    def test_s3_url(self):
        """An s3a URL yields an ``S3FSWrapper`` filesystem with the bucket as the netloc."""
        suj = CarbonFilesystemResolver(
            's3a://bucket{}'.format(ABS_PATH),
            key=access_key,
            secret=secret_key,
            endpoint=endpoint,
            hadoop_configuration=self._hadoop_configuration,
            connector=self.mock)
        self.assertIsInstance(suj.filesystem(), S3FSWrapper)
        self.assertEqual('bucket', suj.parsed_dataset_url().netloc)
        # The dataset path keeps the bucket name as its prefix.
        self.assertEqual('bucket' + ABS_PATH, suj.get_dataset_path())
class FilesystemResolverTest(unittest.TestCase):
    """
    Checks the full filesystem resolution functionality, exercising each URL interpretation case.
    """

    @classmethod
    def setUpClass(cls):
        """Creates the mock HDFS connector and the user name shared by every test."""
        cls.mock = MockHdfsConnector()
        cls.mock_name = "mock-manager"

    def setUp(self):
        """Initializes a mock hadoop config and populates it with basic properties."""
        # Reset counters in mock connector
        self.mock.reset()
        self._hadoop_configuration = MockHadoopConfiguration()
        self._hadoop_configuration.set('fs.defaultFS', HC.FS_WARP_TURTLE)
        self._hadoop_configuration.set(
            'dfs.ha.namenodes.{}'.format(HC.WARP_TURTLE), 'nn2,nn1')
        self._hadoop_configuration.set(
            'dfs.namenode.rpc-address.{}.nn1'.format(HC.WARP_TURTLE), HC.WARP_TURTLE_NN1)
        self._hadoop_configuration.set(
            'dfs.namenode.rpc-address.{}.nn2'.format(HC.WARP_TURTLE), HC.WARP_TURTLE_NN2)

    def _assert_connect_attempts(self, nn1=0, nn2=0, default=0):
        """Asserts the values the mock connector's ``connect_attempted`` returns per namenode.

        :param nn1: expected value for ``HC.WARP_TURTLE_NN1``
        :param nn2: expected value for ``HC.WARP_TURTLE_NN2``
        :param default: expected value for ``HC.DEFAULT_NN``
        """
        # Checked in nn2, nn1, default order, matching the order the namenodes
        # are listed under 'dfs.ha.namenodes' in setUp.
        self.assertEqual(nn2, self.mock.connect_attempted(HC.WARP_TURTLE_NN2))
        self.assertEqual(nn1, self.mock.connect_attempted(HC.WARP_TURTLE_NN1))
        self.assertEqual(default, self.mock.connect_attempted(HC.DEFAULT_NN))

    def _assert_factory_is_serializable(self, resolver):
        """Runs ``dill.dumps`` on the filesystem factory of ``resolver``.

        :param resolver: the resolver under test
        """
        # Make sure we did not capture FilesystemResolver in a closure by mistake
        dill.dumps(resolver.filesystem_factory())

    def test_error_url_cases(self):
        """Various error cases that result in exception raised."""
        # Case 1: Schemeless path asserts
        with self.assertRaises(ValueError):
            FilesystemResolver(ABS_PATH, {})

        # Case 4b: HDFS default path case with NO defaultFS
        with self.assertRaises(RuntimeError):
            FilesystemResolver('hdfs:///some/path', {})

        # Case 4b: Using `default` as host, while apparently a pyarrow convention, is NOT valid
        with self.assertRaises(ArrowIOError):
            FilesystemResolver('hdfs://default', {})

        # Case 5: other schemes result in ValueError; urlparse to cover an else branch!
        with self.assertRaises(ValueError):
            FilesystemResolver(urlparse('unknown://foo/bar'), {})

        # s3 paths must have the bucket as the netloc
        with self.assertRaises(ValueError):
            FilesystemResolver(urlparse('s3:///foo/bar'), {})

        # GCS paths must have the bucket as the netloc
        with self.assertRaises(ValueError):
            FilesystemResolver(urlparse('gcs:///foo/bar'), {})

    def test_file_url(self):
        """ Case 2: File path, agnostic to content of hadoop configuration."""
        suj = FilesystemResolver(
            'file://{}'.format(ABS_PATH), self._hadoop_configuration, connector=self.mock)
        self.assertIsInstance(suj.filesystem(), LocalFileSystem)
        self.assertEqual('', suj.parsed_dataset_url().netloc)
        self.assertEqual(ABS_PATH, suj.get_dataset_path())

        self._assert_factory_is_serializable(suj)

    def test_hdfs_url_with_nameservice(self):
        """ Case 3a: HDFS nameservice."""
        suj = FilesystemResolver(
            HC.WARP_TURTLE_PATH, self._hadoop_configuration,
            connector=self.mock, user=self.mock_name)
        self.assertEqual(MockHdfs, type(suj.filesystem()._hdfs))
        self.assertEqual(self.mock_name, suj.filesystem()._user)
        self.assertEqual(HC.WARP_TURTLE, suj.parsed_dataset_url().netloc)
        self._assert_connect_attempts(nn2=1)

        self._assert_factory_is_serializable(suj)

    def test_hdfs_url_no_nameservice(self):
        """ Case 3b: HDFS with no nameservice should connect to default namenode."""
        suj = FilesystemResolver(
            'hdfs:///some/path', self._hadoop_configuration,
            connector=self.mock, user=self.mock_name)
        self.assertEqual(MockHdfs, type(suj.filesystem()._hdfs))
        self.assertEqual(self.mock_name, suj.filesystem()._user)
        self.assertEqual(HC.WARP_TURTLE, suj.parsed_dataset_url().netloc)
        # ensure path is preserved in parsed URL
        self.assertEqual('/some/path', suj.get_dataset_path())
        self._assert_connect_attempts(nn2=1)

        self._assert_factory_is_serializable(suj)

    def test_hdfs_url_direct_namenode(self):
        """ Case 4: direct namenode."""
        suj = FilesystemResolver(
            'hdfs://{}/path'.format(HC.WARP_TURTLE_NN1), self._hadoop_configuration,
            connector=self.mock, user=self.mock_name)
        self.assertEqual(MockHdfs, type(suj.filesystem()))
        self.assertEqual(self.mock_name, suj.filesystem()._user)
        self.assertEqual(HC.WARP_TURTLE_NN1, suj.parsed_dataset_url().netloc)
        self._assert_connect_attempts(nn1=1)

        self._assert_factory_is_serializable(suj)

    def test_hdfs_url_direct_namenode_driver_libhdfs(self):
        """ Case 4: direct namenode, with ``hdfs_driver='libhdfs'`` passed explicitly."""
        suj = FilesystemResolver(
            'hdfs://{}/path'.format(HC.WARP_TURTLE_NN1), self._hadoop_configuration,
            connector=self.mock, hdfs_driver='libhdfs', user=self.mock_name)
        self.assertEqual(MockHdfs, type(suj.filesystem()))
        self.assertEqual(self.mock_name, suj.filesystem()._user)

        self._assert_factory_is_serializable(suj)

    def test_hdfs_url_direct_namenode_retries(self):
        """ Case 4: direct namenode fails first two times thru, but 2nd retry succeeds."""
        self.mock.set_fail_n_next_connect(2)
        namenode_url = 'hdfs://{}/path'.format(HC.WARP_TURTLE_NN2)

        # First failing attempt: one connection attempt against NN2 only.
        with self.assertRaises(ArrowIOError):
            FilesystemResolver(
                namenode_url, self._hadoop_configuration,
                connector=self.mock, user=self.mock_name)
        self._assert_connect_attempts(nn2=1)

        # Second failing attempt (no explicit user, as in the original test).
        with self.assertRaises(ArrowIOError):
            FilesystemResolver(
                namenode_url, self._hadoop_configuration, connector=self.mock)
        self._assert_connect_attempts(nn2=2)

        # this one should connect "successfully"
        suj = FilesystemResolver(
            namenode_url, self._hadoop_configuration,
            connector=self.mock, user=self.mock_name)
        self.assertEqual(MockHdfs, type(suj.filesystem()))
        self.assertEqual(self.mock_name, suj.filesystem()._user)
        self.assertEqual(HC.WARP_TURTLE_NN2, suj.parsed_dataset_url().netloc)
        self._assert_connect_attempts(nn2=3)

    def test_s3_url(self):
        """An s3 URL yields an ``s3fs.S3FileSystem`` with the bucket as the netloc."""
        suj = FilesystemResolver(
            's3://bucket{}'.format(ABS_PATH), self._hadoop_configuration, connector=self.mock)
        self.assertIsInstance(suj.filesystem(), s3fs.S3FileSystem)
        self.assertEqual('bucket', suj.parsed_dataset_url().netloc)
        self.assertEqual('bucket' + ABS_PATH, suj.get_dataset_path())

        self._assert_factory_is_serializable(suj)

    def test_gcs_url(self):
        """A gcs URL yields a ``gcsfs.GCSFileSystem`` with the bucket as the netloc."""
        suj = FilesystemResolver(
            'gcs://bucket{}'.format(ABS_PATH), self._hadoop_configuration, connector=self.mock)
        self.assertIsInstance(suj.filesystem(), gcsfs.GCSFileSystem)
        self.assertEqual('bucket', suj.parsed_dataset_url().netloc)
        self.assertEqual('bucket' + ABS_PATH, suj.get_dataset_path())

        self._assert_factory_is_serializable(suj)

    def test_get_filesystem_and_path_or_paths(self):
        """Single URL, list of URLs, and a list mixing schemes (which must raise ValueError)."""
        # unittest assertions are used instead of bare ``assert`` so the checks
        # survive ``python -O`` and report the offending values on failure.
        fs1, path1 = get_filesystem_and_path_or_paths('file:///some/path')
        self.assertIsInstance(fs1, LocalFileSystem)
        self.assertEqual('/some/path', path1)

        fs2, paths2 = get_filesystem_and_path_or_paths(
            ['file:///some/path/01.parquet', 'file:///some/path/02.parquet'])
        self.assertIsInstance(fs2, LocalFileSystem)
        self.assertEqual(['/some/path/01.parquet', '/some/path/02.parquet'], paths2)

        with self.assertRaises(ValueError):
            get_filesystem_and_path_or_paths([
                'file:///some/path/01.parquet',
                'hdfs:///some/path/02.parquet'
            ])