def test_create_schema_view_using_regex_and_unischema_fields(self):
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int_field', np.int8, (), ScalarCodec(IntegerType()), False),
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
        UnischemaField('other_string_field', np.string_, (), ScalarCodec(StringType()), False),
    ])
    view = TestSchema.create_schema_view(['int.*$', TestSchema.string_field])
    self.assertEqual(set(view.fields.keys()), {'int_field', 'string_field'})
def test_as_spark_schema(self):
    """Try using 'as_spark_schema' function"""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int_field', np.int8, (), ScalarCodec(IntegerType()), False),
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
    ])

    spark_schema = TestSchema.as_spark_schema()
    self.assertEqual(spark_schema.fields[0].name, 'int_field')
    self.assertEqual(spark_schema.fields[1].name, 'string_field')

    self.assertEqual(TestSchema.fields['int_field'].name, 'int_field')
    self.assertEqual(TestSchema.fields['string_field'].name, 'string_field')
def transform_schema(schema, transform_spec):
    """Creates a post-transform schema given a pre-transform schema and a transform_spec
    with mutation instructions.

    :param schema: A pre-transform schema
    :param transform_spec: a TransformSpec object with mutation instructions.
    :return: A post-transform schema
    """
    removed_fields = set(transform_spec.removed_fields)
    unknown_field_names = removed_fields - set(schema.fields.keys())
    if unknown_field_names:
        raise ValueError('Unexpected field names found in TransformSpec remove_fields list: "%s". '
                         'Valid values are "%s".'
                         % (', '.join(unknown_field_names), ', '.join(schema.fields.keys())))

    exclude_fields = {f[0] for f in transform_spec.edit_fields} | removed_fields
    fields = [v for k, v in schema.fields.items() if k not in exclude_fields]

    for field_to_edit in transform_spec.edit_fields:
        edited_unischema_field = UnischemaField(name=field_to_edit[0], numpy_dtype=field_to_edit[1],
                                                shape=field_to_edit[2], codec=None,
                                                nullable=field_to_edit[3])
        fields.append(edited_unischema_field)

    return Unischema(schema._name + '_transformed', fields)
def transform_schema(schema, transform_spec):
    """Creates a post-transform schema given a pre-transform schema and a transform_spec
    with mutation instructions.

    :param schema: A pre-transform schema
    :param transform_spec: a TransformSpec object with mutation instructions.
    :return: A post-transform schema
    """
    removed_fields = set(transform_spec.removed_fields)
    unknown_field_names = removed_fields - set(schema.fields.keys())
    if unknown_field_names:
        warnings.warn('remove_fields specified some field names that are not part of the schema. '
                      'These field names will be ignored: "{}".'.format(', '.join(unknown_field_names)))

    exclude_fields = {f[0] for f in transform_spec.edit_fields} | removed_fields
    fields = [v for k, v in schema.fields.items() if k not in exclude_fields]

    for field_to_edit in transform_spec.edit_fields:
        edited_unischema_field = UnischemaField(name=field_to_edit[0], numpy_dtype=field_to_edit[1],
                                                shape=field_to_edit[2], codec=None,
                                                nullable=field_to_edit[3])
        fields.append(edited_unischema_field)

    return Unischema(schema._name + '_transformed', fields)
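# A minimal usage sketch of driving transform_schema with a petastorm TransformSpec.
# The field names 'image' and 'raw_bytes' and the variable pre_transform_schema are
# assumptions for illustration, not part of the code above. edit_fields takes 4-tuples of
# (name, numpy_dtype, shape, is_nullable), which matches the indexing done in the loop above.
from petastorm.transform import TransformSpec

spec = TransformSpec(func=None,
                     edit_fields=[('image', np.float32, (128, 128, 3), False)],
                     removed_fields=['raw_bytes'])
# pre_transform_schema is assumed to be a Unischema that contains 'image' and 'raw_bytes' fields.
post_transform_schema = transform_schema(pre_transform_schema, spec)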
def get_petastorm_schema(name, column_list):
    petastorm_column_list = []
    for _column in column_list:
        petastorm_column = SchemaUtils.get_petastorm_column(_column)
        petastorm_column_list.append(petastorm_column)

    petastorm_schema = Unischema(name, petastorm_column_list)
    return petastorm_schema
def test_decode_numpy_scalar_with_unknown_dtype():
    """If numpy_dtype is None, then the value is not decoded, just passed through."""
    MatrixSchema = Unischema('TestSchema', [UnischemaField('scalar', None, ())])
    row = {'scalar': [4, 2]}
    decoded_value = decode_row(row, MatrixSchema)['scalar']
    assert decoded_value == [4, 2]
def main(source, target, test_size, under_sampling):
    source_data_dir_path = Path(source)
    target_data_dir_path = Path(target)

    # prepare dirs for the datasets
    application_data_dir_path = target_data_dir_path / 'application_classification'
    traffic_data_dir_path = target_data_dir_path / 'traffic_classification'

    # initialise local spark
    os.environ['PYSPARK_PYTHON'] = sys.executable
    os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
    memory_gb = psutil.virtual_memory().available // 1024 // 1024 // 1024
    spark = (SparkSession.builder
             .master('local[*]')
             .config('spark.driver.memory', f'{memory_gb}g')
             .config('spark.driver.host', '127.0.0.1')
             .getOrCreate())

    # prepare final schema
    schema = Unischema('data_schema', [
        UnischemaField('feature', np.float32, (1, 1500), CompressedNdarrayCodec(), False),
        UnischemaField('flow_feature', np.float32, (1, 76), CompressedNdarrayCodec(), False),
        UnischemaField('label', np.int32, (), ScalarCodec(LongType()), False),
    ])

    # read data
    df = spark.read.parquet(f'{source_data_dir_path.absolute().as_uri()}/*.parquet')

    # prepare data for application classification and traffic classification
    print('processing application classification dataset')
    create_train_test_for_task(df=df, label_col='app_label', spark=spark, schema=schema,
                               test_size=test_size, under_sampling=under_sampling,
                               data_dir_path=application_data_dir_path)

    print('processing traffic classification dataset')
    create_train_test_for_task(df=df, label_col='traffic_label', spark=spark, schema=schema,
                               test_size=test_size, under_sampling=under_sampling,
                               data_dir_path=traffic_data_dir_path)

    # stats
    print_df_label_distribution(spark, schema, application_data_dir_path / 'train.parquet')
    print_df_label_distribution(spark, schema, application_data_dir_path / 'test.parquet')
    print_df_label_distribution(spark, schema, traffic_data_dir_path / 'train.parquet')
    print_df_label_distribution(spark, schema, traffic_data_dir_path / 'test.parquet')
def test_partial_application(self):
    unischema = Unischema('foo', [])
    func = partial(dict_to_spark_row, unischema)
    func({})

    # Must pass as positional arg in the right order
    func = partial(dict_to_spark_row, {})
    with self.assertRaises(AssertionError):
        func(Unischema)
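# A minimal sketch of the pattern the partial-application test above relies on:
# dict_to_spark_row(unischema, row_dict) is typically mapped over an RDD of Python dicts,
# and the result is paired with Unischema.as_spark_schema() to build a DataFrame.
# 'HelloWorldSchema', 'row_generator', 'sc', and 'spark' are assumed to exist here;
# this is an illustrative sketch, not code from the test file.
rows_rdd = (sc.parallelize(range(10))
            .map(row_generator)
            .map(lambda row: dict_to_spark_row(HelloWorldSchema, row)))
df = spark.createDataFrame(rows_rdd, HelloWorldSchema.as_spark_schema())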
def test_decode_numpy_scalar_when_codec_is_none():
    """Decoding a row that has a field with the codec set to None.

    The type should be deduced automatically from UnischemaField's numpy_dtype attribute."""
    MatrixSchema = Unischema('TestSchema', [UnischemaField('scalar', np.float64, ())])
    row = {'scalar': 42.0}
    decoded_value = decode_row(row, MatrixSchema)['scalar']
    assert decoded_value == 42
    assert isinstance(decoded_value, np.float64)