def test_hdfs_url_direct_namenode_driver_libhdfs(self):
    suj = FilesystemResolver('hdfs://{}/path'.format(HC.WARP_TURTLE_NN1), self._hadoop_configuration,
                             connector=self.mock, hdfs_driver='libhdfs')
    self.assertEqual(MockHdfs, type(suj.filesystem()))
    # Make sure we did not capture FilesystemResolver in a closure by mistake
    dill.dumps(suj.filesystem_factory())
def test_s3_url(self):
    suj = FilesystemResolver('s3://bucket{}'.format(ABS_PATH), self._hadoop_configuration, connector=self.mock)
    self.assertTrue(isinstance(suj.filesystem(), S3FSWrapper))
    self.assertEqual('bucket', suj.parsed_dataset_url().netloc)
    self.assertEqual('bucket' + ABS_PATH, suj.get_dataset_path())
    # Make sure we did not capture FilesystemResolver in a closure by mistake
    dill.dumps(suj.filesystem_factory())
def test_file_url(self):
    """ Case 2: File path, agnostic to content of hadoop configuration."""
    suj = FilesystemResolver('file://{}'.format(ABS_PATH), self._hadoop_configuration, connector=self.mock)
    self.assertTrue(isinstance(suj.filesystem(), LocalFileSystem))
    self.assertEqual('', suj.parsed_dataset_url().netloc)
    self.assertEqual(ABS_PATH, suj.get_dataset_path())
    # Make sure we did not capture FilesystemResolver in a closure by mistake
    dill.dumps(suj.filesystem_factory())
def test_hdfs_url_with_nameservice(self):
    """ Case 3a: HDFS nameservice."""
    suj = FilesystemResolver(HC.WARP_TURTLE_PATH, self._hadoop_configuration, connector=self.mock)
    self.assertEqual(MockHdfs, type(suj.filesystem()._hdfs))
    self.assertEqual(HC.WARP_TURTLE, suj.parsed_dataset_url().netloc)
    self.assertEqual(1, self.mock.connect_attempted(HC.WARP_TURTLE_NN2))
    self.assertEqual(0, self.mock.connect_attempted(HC.WARP_TURTLE_NN1))
    self.assertEqual(0, self.mock.connect_attempted(HC.DEFAULT_NN))
    # Make sure we did not capture FilesystemResolver in a closure by mistake
    dill.dumps(suj.filesystem_factory())
def test_hdfs_url_direct_namenode(self):
    """ Case 4: direct namenode."""
    suj = FilesystemResolver('hdfs://{}/path'.format(HC.WARP_TURTLE_NN1), self._hadoop_configuration,
                             connector=self.mock)
    self.assertEqual(MockHdfs, type(suj.filesystem()))
    self.assertEqual(HC.WARP_TURTLE_NN1, suj.parsed_dataset_url().netloc)
    self.assertEqual(0, self.mock.connect_attempted(HC.WARP_TURTLE_NN2))
    self.assertEqual(1, self.mock.connect_attempted(HC.WARP_TURTLE_NN1))
    self.assertEqual(0, self.mock.connect_attempted(HC.DEFAULT_NN))
    # Make sure we did not capture FilesystemResolver in a closure by mistake
    dill.dumps(suj.filesystem_factory())
def test_hdfs_url_no_nameservice(self):
    """ Case 3b: HDFS with no nameservice should connect to default namenode."""
    suj = FilesystemResolver('hdfs:///some/path', self._hadoop_configuration, connector=self.mock)
    self.assertEqual(MockHdfs, type(suj.filesystem()._hdfs))
    self.assertEqual(HC.WARP_TURTLE, suj.parsed_dataset_url().netloc)
    # ensure path is preserved in parsed URL
    self.assertEqual('/some/path', suj.get_dataset_path())
    self.assertEqual(1, self.mock.connect_attempted(HC.WARP_TURTLE_NN2))
    self.assertEqual(0, self.mock.connect_attempted(HC.WARP_TURTLE_NN1))
    self.assertEqual(0, self.mock.connect_attempted(HC.DEFAULT_NN))
    # Make sure we did not capture FilesystemResolver in a closure by mistake
    dill.dumps(suj.filesystem_factory())
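
# The tests above exercise FilesystemResolver against a mocked HDFS connector. As a rough,
# hypothetical sketch (not part of the test suite), this is how the resolver is typically used to
# turn a dataset URL into a pyarrow filesystem plus a store-relative path; the example URL and the
# helper name are placeholders, and calling filesystem_factory() keeps the result picklable so it
# can be shipped to Spark workers without capturing the resolver itself.
def _example_resolve_dataset_url(dataset_url='file:///tmp/example_dataset'):
    from petastorm.fs_utils import FilesystemResolver

    resolver = FilesystemResolver(dataset_url)
    filesystem_factory = resolver.filesystem_factory()  # picklable callable returning a filesystem
    filesystem = filesystem_factory()                   # concrete pyarrow filesystem instance
    dataset_path = resolver.get_dataset_path()          # path with the scheme/netloc stripped
    return filesystem, dataset_path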
@contextmanager
def materialize_dataset(spark, dataset_url, schema, row_group_size_mb=None, use_summary_metadata=False,
                        filesystem_factory=None):
    """
    A Context Manager which handles all the initialization and finalization necessary
    to generate metadata for a petastorm dataset. This should be used around your
    spark logic to materialize a dataset (specifically the writing of parquet output).

    Note: Any rowgroup indexing should happen outside the materialize_dataset block

    Example:

    >>> spark = SparkSession.builder...
    >>> ds_url = 'hdfs:///path/to/my/dataset'
    >>> with materialize_dataset(spark, ds_url, MyUnischema, 64):
    >>>   spark.sparkContext.parallelize(range(0, 10)).
    >>>     ...
    >>>     .write.parquet(ds_url)
    >>> indexer = [SingleFieldIndexer(...)]
    >>> build_rowgroup_index(ds_url, spark.sparkContext, indexer)

    A user may provide their own recipe for creation of a pyarrow filesystem object in the ``filesystem_factory``
    argument (otherwise, petastorm will create a default one based on the url).

    The following example shows how a custom pyarrow HDFS filesystem, instantiated using the ``libhdfs`` driver,
    can be used during Petastorm dataset generation:

    >>> resolver = FilesystemResolver(dataset_url, spark.sparkContext._jsc.hadoopConfiguration(),
    >>>                               hdfs_driver='libhdfs')
    >>> with materialize_dataset(..., filesystem_factory=resolver.filesystem_factory()):
    >>>     ...

    :param spark: The spark session you are using
    :param dataset_url: The dataset url to output your dataset to (e.g. ``hdfs:///path/to/dataset``)
    :param schema: The :class:`petastorm.unischema.Unischema` definition of your dataset
    :param row_group_size_mb: The parquet row group size to use for your dataset
    :param use_summary_metadata: Whether to use the parquet summary metadata for row group indexing or a custom
      indexing method. The custom indexing method is more scalable for very large datasets.
    :param filesystem_factory: A filesystem factory function to be used when saving Petastorm specific metadata to the
      Parquet store.
    """
    spark_config = {}
    _init_spark(spark, spark_config, row_group_size_mb, use_summary_metadata)
    yield

    # After job completes, add the unischema metadata and check for the metadata summary file
    if filesystem_factory is None:
        resolver = FilesystemResolver(dataset_url, spark.sparkContext._jsc.hadoopConfiguration(),
                                      user=spark.sparkContext.sparkUser())
        filesystem_factory = resolver.filesystem_factory()
        dataset_path = resolver.get_dataset_path()
    else:
        dataset_path = get_dataset_path(urlparse(dataset_url))
    filesystem = filesystem_factory()

    dataset = pq.ParquetDataset(dataset_path, filesystem=filesystem, validate_schema=False)

    _generate_unischema_metadata(dataset, schema)
    if not use_summary_metadata:
        _generate_num_row_groups_per_file(dataset, spark.sparkContext, filesystem_factory)

    # Reload the dataset to take into account the new metadata
    dataset = pq.ParquetDataset(dataset_path, filesystem=filesystem, validate_schema=False)
    try:
        # Try to load the row groups, if it fails that means the metadata was not generated properly
        load_row_groups(dataset)
    except PetastormMetadataError:
        raise PetastormMetadataGenerationError(
            'Could not find summary metadata file. The dataset will exist but you will need'
            ' to execute petastorm-generate-metadata.py before you can read your dataset'
            ' in order to generate the necessary metadata.'
            ' Try increasing spark driver memory next time and making sure you are'
            ' using parquet-mr >= 1.8.3')

    _cleanup_spark(spark, spark_config, row_group_size_mb)
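
# A minimal, self-contained sketch of driving materialize_dataset end to end, assuming a local
# file:// output URL; the single-field schema, the helper name and the URL below are illustrative
# placeholders and not part of this module.
def _example_write_dataset(output_url='file:///tmp/example_dataset', rows=10):
    import numpy as np
    from pyspark.sql import SparkSession
    from pyspark.sql.types import IntegerType

    from petastorm.codecs import ScalarCodec
    from petastorm.unischema import Unischema, UnischemaField, dict_to_spark_row

    # Single scalar field, used only for this illustration.
    example_schema = Unischema('ExampleSchema', [
        UnischemaField('id', np.int32, (), ScalarCodec(IntegerType()), False),
    ])

    spark = SparkSession.builder.master('local[2]').getOrCreate()
    with materialize_dataset(spark, output_url, example_schema, row_group_size_mb=64):
        # Convert plain dicts into Spark rows that follow the Unischema, then write Parquet.
        rows_rdd = spark.sparkContext.parallelize(range(rows)) \
            .map(lambda i: {'id': np.int32(i)}) \
            .map(lambda d: dict_to_spark_row(example_schema, d))
        spark.createDataFrame(rows_rdd, example_schema.as_spark_schema()) \
            .coalesce(1) \
            .write \
            .mode('overwrite') \
            .parquet(output_url)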
def generate_petastorm_metadata(spark, dataset_url, unischema_class=None, use_summary_metadata=False,
                                hdfs_driver='libhdfs3'):
    """
    Generates the metadata necessary to read a petastorm dataset and adds it to an existing dataset.

    :param spark: spark session
    :param dataset_url: url of existing dataset
    :param unischema_class: (optional) fully qualified dataset unischema class. If not specified will attempt
        to find one already in the dataset.
        (e.g. :class:`examples.hello_world.generate_hello_world_dataset.HelloWorldSchema`)
    :param use_summary_metadata: Whether to use the parquet summary metadata for row group indexing or a custom
        indexing method.
    :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs). Current choices are
        libhdfs (java through JNI) or libhdfs3 (C++)
    """
    sc = spark.sparkContext

    resolver = FilesystemResolver(dataset_url, sc._jsc.hadoopConfiguration(), hdfs_driver=hdfs_driver,
                                  user=spark.sparkContext.sparkUser())
    fs = resolver.filesystem()
    dataset = pq.ParquetDataset(resolver.get_dataset_path(), filesystem=fs, validate_schema=False)

    if unischema_class:
        schema = locate(unischema_class)
        if not isinstance(schema, Unischema):
            raise ValueError('The specified class %s is not an instance of a petastorm.Unischema object.'
                             % unischema_class)
    else:
        try:
            schema = get_schema(dataset)
        except ValueError:
            raise ValueError('Unischema class could not be located in existing dataset,'
                             ' please specify it')

    # In order to be backwards compatible, we retrieve the common metadata from the dataset before
    # overwriting the metadata to keep row group indexes and the old row group per file index
    arrow_metadata = dataset.common_metadata or None

    with materialize_dataset(spark, dataset_url, schema, use_summary_metadata=use_summary_metadata,
                             filesystem_factory=resolver.filesystem_factory()):
        if use_summary_metadata:
            # Inside the materialize dataset context we just need to write the metadata file as the schema will
            # be written by the context manager.
            # We use the java ParquetOutputCommitter to write the metadata file for the existing dataset
            # which will read all the footers of the dataset in parallel and merge them.
            hadoop_config = sc._jsc.hadoopConfiguration()
            Path = sc._gateway.jvm.org.apache.hadoop.fs.Path
            parquet_output_committer = sc._gateway.jvm.org.apache.parquet.hadoop.ParquetOutputCommitter
            parquet_output_committer.writeMetaDataFile(hadoop_config, Path(dataset_url))

    spark.stop()

    if use_summary_metadata and arrow_metadata:
        # When calling writeMetaDataFile it will overwrite the _common_metadata file which could have schema
        # information or row group indexers. Therefore we want to retain this information and will add it to the
        # new _common_metadata file. If we were using the old legacy metadata method this file won't be deleted.
        base_schema = arrow_metadata.schema.to_arrow_schema()
        metadata_dict = base_schema.metadata
        if ROW_GROUPS_PER_FILE_KEY in metadata_dict:
            add_to_dataset_metadata(dataset, ROW_GROUPS_PER_FILE_KEY, metadata_dict[ROW_GROUPS_PER_FILE_KEY])
        if ROWGROUPS_INDEX_KEY in metadata_dict:
            add_to_dataset_metadata(dataset, ROWGROUPS_INDEX_KEY, metadata_dict[ROWGROUPS_INDEX_KEY])
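
# A brief, hypothetical invocation of generate_petastorm_metadata on an existing Parquet store; the
# dataset_url is a placeholder and relying on the default 'libhdfs3' driver is an assumption. Note
# that generate_petastorm_metadata stops the Spark session itself.
def _example_regenerate_metadata(dataset_url='file:///tmp/example_dataset'):
    from pyspark.sql import SparkSession

    spark = SparkSession.builder \
        .master('local[2]') \
        .appName('petastorm-generate-metadata-example') \
        .getOrCreate()
    # unischema_class may be omitted when the schema is already stored in the dataset metadata.
    generate_petastorm_metadata(spark, dataset_url)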
def _create_dataset(store, df, validation, compress_sparse, num_partitions, num_workers, dataset_idx,
                    parquet_row_group_size_mb, verbose):
    train_data_path = store.get_train_data_path(dataset_idx)
    val_data_path = store.get_val_data_path(dataset_idx)
    if verbose >= 1:
        print('CEREBRO => Time: {}, Writing DataFrames'.format(
            datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
        print('CEREBRO => Time: {}, Train Data Path: {}'.format(
            datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), train_data_path))
        print('CEREBRO => Time: {}, Val Data Path: {}'.format(
            datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), val_data_path))

    schema_cols = df.columns
    if isinstance(validation, str):
        schema_cols.append(validation)
    df = df[schema_cols]

    metadata = None
    if _has_vector_column(df):
        if compress_sparse:
            metadata = _get_metadata(df)
        to_petastorm = to_petastorm_fn(schema_cols, metadata)
        df = df.rdd.map(to_petastorm).toDF()

    train_df, val_df, validation_ratio = _train_val_split(df, validation)

    unischema_fields = []
    metadata = _get_metadata(train_df)
    for k in metadata.keys():
        field_type = spark_to_petastorm_type(metadata[k]['spark_data_type'])
        shape = petastorm_unischema_shape(metadata[k]['shape'])
        codec = petastorm_unischema_codec(metadata[k]['shape'], metadata[k]['spark_data_type'])
        unischema_fields.append(UnischemaField(k, field_type, shape, codec, False))

    petastorm_schema = Unischema('petastorm_schema', unischema_fields)

    train_partitions = max(int(num_partitions * (1.0 - validation_ratio)), num_workers)
    if verbose >= 1:
        print('CEREBRO => Time: {}, Train Partitions: {}'.format(
            datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), train_partitions))

    spark = SparkSession.builder.getOrCreate()
    # FIXME pass hdfs_driver from user interface instead of hardcoded PETASTORM_HDFS_DRIVER
    train_resolver = FilesystemResolver(train_data_path,
                                        spark.sparkContext._jsc.hadoopConfiguration(),
                                        user=spark.sparkContext.sparkUser(),
                                        hdfs_driver=constants.PETASTORM_HDFS_DRIVER)
    with materialize_dataset(spark, train_data_path, petastorm_schema, parquet_row_group_size_mb,
                             filesystem_factory=train_resolver.filesystem_factory()):
        train_rdd = train_df.rdd.map(lambda x: x.asDict()) \
            .map(lambda x: {k: np.array(x[k], dtype=spark_to_petastorm_type(metadata[k]['spark_data_type']))
                            for k in x}) \
            .map(lambda x: dict_to_spark_row(petastorm_schema, x))

        spark.createDataFrame(train_rdd, petastorm_schema.as_spark_schema()) \
            .coalesce(train_partitions) \
            .write \
            .mode('overwrite') \
            .parquet(train_data_path)

    if val_df:
        val_partitions = max(int(num_partitions * validation_ratio), num_workers)
        if verbose >= 1:
            print('CEREBRO => Time: {}, Val Partitions: {}'.format(
                datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), val_partitions))

        val_resolver = FilesystemResolver(val_data_path,
                                          spark.sparkContext._jsc.hadoopConfiguration(),
                                          user=spark.sparkContext.sparkUser(),
                                          hdfs_driver=constants.PETASTORM_HDFS_DRIVER)
        with materialize_dataset(spark, val_data_path, petastorm_schema, parquet_row_group_size_mb,
                                 filesystem_factory=val_resolver.filesystem_factory()):
            val_rdd = val_df.rdd.map(lambda x: x.asDict()) \
                .map(lambda x: {k: np.array(x[k], dtype=spark_to_petastorm_type(metadata[k]['spark_data_type']))
                                for k in x}) \
                .map(lambda x: dict_to_spark_row(petastorm_schema, x))

            spark.createDataFrame(val_rdd, petastorm_schema.as_spark_schema()) \
                .coalesce(val_partitions) \
                .write \
                .mode('overwrite') \
                .parquet(val_data_path)

    train_rows, val_rows, pq_metadata, avg_row_size = get_simple_meta_from_parquet(store, df.columns, dataset_idx)

    if verbose:
        print('CEREBRO => Time: {}, Train Rows: {}'.format(
            datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), train_rows))

    if val_df:
        if val_rows == 0:
            raise ValueError(
                'Validation DataFrame does not contain any samples with validation param {}'
                .format(validation))
        if verbose:
            print('CEREBRO => Time: {}, Val Rows: {}'.format(
                datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), val_rows))

    return train_rows, val_rows, pq_metadata, avg_row_size
def copy_dataset(spark, source_url, target_url, field_regex, not_null_fields, overwrite_output, partitions_count,
                 row_group_size_mb, hdfs_driver='libhdfs3'):
    """
    Creates a copy of a dataset. A new dataset will optionally contain a subset of columns. Rows that have NULL
    values in fields defined by the ``not_null_fields`` argument are filtered out.

    :param spark: An instance of ``SparkSession`` object
    :param source_url: A url of the dataset to be copied.
    :param target_url: A url specifying location of the target dataset.
    :param field_regex: A list of regex patterns. Only columns that match one of these patterns are copied to the
      new dataset.
    :param not_null_fields: A list of fields that must have non-NULL values in the target dataset.
    :param overwrite_output: If ``False`` and there is an existing path defined by ``target_url``, the operation
      will fail.
    :param partitions_count: If not ``None``, the dataset is repartitioned before write. Number of files in the
      target Parquet store is defined by this parameter.
    :param row_group_size_mb: The size of the rowgroup in the target dataset. Specified in megabytes.
    :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs). Current choices are
        libhdfs (java through JNI) or libhdfs3 (C++)
    :return: None
    """
    schema = get_schema_from_dataset_url(source_url, hdfs_driver=hdfs_driver)

    fields = match_unischema_fields(schema, field_regex)

    if field_regex and not fields:
        field_names = list(schema.fields.keys())
        raise ValueError('Regular expressions (%s) do not match any fields (%s)'
                         % (str(field_regex), str(field_names)))

    if fields:
        subschema = schema.create_schema_view(fields)
    else:
        subschema = schema

    resolver = FilesystemResolver(target_url, spark.sparkContext._jsc.hadoopConfiguration(),
                                  hdfs_driver=hdfs_driver)
    with materialize_dataset(spark, target_url, subschema, row_group_size_mb,
                             filesystem_factory=resolver.filesystem_factory()):
        data_frame = spark.read \
            .parquet(source_url)

        if fields:
            data_frame = data_frame.select(*[f.name for f in fields])

        if not_null_fields:
            not_null_condition = reduce(operator.__and__, (data_frame[f].isNotNull() for f in not_null_fields))
            data_frame = data_frame.filter(not_null_condition)

        if partitions_count:
            data_frame = data_frame.repartition(partitions_count)

        data_frame.write \
            .mode('overwrite' if overwrite_output else 'error') \
            .option('compression', 'none') \
            .parquet(target_url)
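
# A small, hypothetical driver for copy_dataset; the URLs, the field_regex pattern, the not_null
# field and the partition count are placeholders chosen only for illustration.
def _example_copy_dataset(source_url='file:///tmp/example_dataset',
                          target_url='file:///tmp/example_dataset_copy'):
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.master('local[2]').appName('copy-dataset-example').getOrCreate()
    try:
        copy_dataset(spark, source_url, target_url,
                     field_regex=['id.*'],    # keep only columns whose names start with 'id'
                     not_null_fields=['id'],  # drop rows where 'id' is NULL
                     overwrite_output=True,
                     partitions_count=2,
                     row_group_size_mb=128)
    finally:
        spark.stop()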