Example #1
def get_petastorm_schema(name, column_list):
    petastorm_column_list = []
    for _column in column_list:
        petastorm_column = SchemaUtils.get_petastorm_column(_column)
        petastorm_column_list.append(petastorm_column)

    petastorm_schema = Unischema(name, petastorm_column_list)
    return petastorm_schema
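
For reference, a minimal hand-rolled equivalent of what this helper produces, assuming SchemaUtils.get_petastorm_column maps each column description to a UnischemaField (the field names and types below are illustrative):

import numpy as np
from pyspark.sql.types import IntegerType, StringType
from petastorm.codecs import ScalarCodec
from petastorm.unischema import Unischema, UnischemaField

# Hypothetical fields standing in for what get_petastorm_column would emit.
petastorm_schema = Unischema('my_schema', [
    UnischemaField('id', np.int32, (), ScalarCodec(IntegerType()), False),
    UnischemaField('name', np.string_, (), ScalarCodec(StringType()), True),
])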
Example #2
def test_dict_to_spark_row_field_validation_scalar_nullable():
    """Test various validations done on data types when converting a dictionary to a spark row"""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), True),
        UnischemaField('nullable_implicitly_set', np.string_, (), ScalarCodec(StringType()), True),
    ])

    assert isinstance(dict_to_spark_row(TestSchema, {'string_field': None}), Row)
Example #3
def test_create_schema_view_using_regex_and_unischema_fields_with_duplicates():
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int_field', np.int8, (), ScalarCodec(IntegerType()), False),
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
        UnischemaField('other_string_field', np.string_, (), ScalarCodec(StringType()), False),
    ])
    view = TestSchema.create_schema_view(['int.*$', TestSchema.int_field])
    assert set(view.fields.keys()) == {'int_field'}
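
Schema views mirror the column selection petastorm performs at read time; a minimal sketch, assuming an existing petastorm dataset at a placeholder URL, passing the same regex to make_reader:

from petastorm import make_reader

# Placeholder URL; schema_fields accepts regex patterns and/or UnischemaField objects.
with make_reader('file:///tmp/test_dataset', schema_fields=['int.*$']) as reader:
    for row in reader:
        print(row.int_field)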
Example #4
def test_create_schema_view_using_invalid_type():
    """ Exercises code paths unischema.create_schema_view ValueError, and unischema.__str__."""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int_field', np.int8, (), ScalarCodec(IntegerType()), False),
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
    ])
    with pytest.raises(ValueError, match='must be either a string'):
        TestSchema.create_schema_view([42])
Example #5
def test_field_name_conflict_with_unischema_attribute(self):
    # fields is an existing attribute of Unischema
    with pytest.warns(UserWarning,
                      match='Can not create dynamic property'):
        Unischema('TestSchema', [
            UnischemaField('fields', np.int32,
                           (), ScalarCodec(StringType()), True)
        ])
Example #6
def test_as_spark_schema_unspecified_codec_type_for_non_scalars_raises():
    """Do not currently support choosing spark type automatically for non-scalar types."""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int_vector_unspecified_codec', np.int8, (1,)),
    ])

    with pytest.raises(ValueError, match='has codec set to None'):
        TestSchema.as_spark_schema()
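
The usual fix is to give non-scalar fields an explicit codec; a minimal sketch using NdarrayCodec so that as_spark_schema no longer raises:

import numpy as np
from petastorm.codecs import NdarrayCodec
from petastorm.unischema import Unischema, UnischemaField

FixedSchema = Unischema('FixedSchema', [
    # An explicit codec tells petastorm how to store the vector in a spark column.
    UnischemaField('int_vector', np.int8, (1,), NdarrayCodec(), False),
])
FixedSchema.as_spark_schema()  # succeeds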
Example #7
def test_decode_numpy_scalar_with_unknown_dtype():
    """If numpy_dtype is None, then the value is not decoded, just passed through."""

    MatrixSchema = Unischema('TestSchema',
                             [UnischemaField('scalar', None, ())])
    row = {'scalar': [4, 2]}
    decoded_value = decode_row(row, MatrixSchema)['scalar']
    assert decoded_value == [4, 2]
Example #8
def test_create_schema_view_fails_validate():
    """ Exercises code paths unischema.create_schema_view ValueError, and unischema.__str__."""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int_field', np.int8, (), ScalarCodec(IntegerType()), False),
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
    ])
    with pytest.raises(ValueError, match='does not belong to the schema'):
        TestSchema.create_schema_view([UnischemaField('id', np.int64, (), ScalarCodec(LongType()), False)])
Example #9
def main(source, target, test_size, under_sampling):
    source_data_dir_path = Path(source)
    target_data_dir_path = Path(target)

    # prepare dir for dataset
    application_data_dir_path = target_data_dir_path / 'application_classification'
    traffic_data_dir_path = target_data_dir_path / 'traffic_classification'

    # initialise local spark
    os.environ['PYSPARK_PYTHON'] = sys.executable
    os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
    memory_gb = psutil.virtual_memory().available // 1024 // 1024 // 1024
    spark = (SparkSession.builder.master('local[*]').config(
        'spark.driver.memory',
        f'{memory_gb}g').config('spark.driver.host',
                                '127.0.0.1').getOrCreate())

    # prepare final schema
    schema = Unischema('data_schema', [
        UnischemaField('feature', np.float32,
                       (1, 1500), CompressedNdarrayCodec(), False),
        UnischemaField('flow_feature', np.float32,
                       (1, 76), CompressedNdarrayCodec(), False),
        UnischemaField('label', np.int32, (), ScalarCodec(LongType()), False),
    ])

    # read data
    df = spark.read.parquet(
        f'{source_data_dir_path.absolute().as_uri()}/*.parquet')

    # prepare data for application classification and traffic classification
    print('processing application classification dataset')
    create_train_test_for_task(df=df,
                               label_col='app_label',
                               spark=spark,
                               schema=schema,
                               test_size=test_size,
                               under_sampling=under_sampling,
                               data_dir_path=application_data_dir_path)

    print('processing traffic classification dataset')
    create_train_test_for_task(df=df,
                               label_col='traffic_label',
                               spark=spark,
                               schema=schema,
                               test_size=test_size,
                               under_sampling=under_sampling,
                               data_dir_path=traffic_data_dir_path)

    # stats
    print_df_label_distribution(spark, schema,
                                application_data_dir_path / 'train.parquet')
    print_df_label_distribution(spark, schema,
                                application_data_dir_path / 'test.parquet')
    print_df_label_distribution(spark, schema,
                                traffic_data_dir_path / 'train.parquet')
    print_df_label_distribution(spark, schema,
                                traffic_data_dir_path / 'test.parquet')
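
Inside a helper like create_train_test_for_task, the dataframe would typically be written under petastorm's materialize_dataset context manager so the Unischema metadata lands next to the Parquet files. A simplified sketch, reusing spark, schema, and df from above (the output URL and row-group size are placeholders):

from petastorm.etl.dataset_metadata import materialize_dataset

output_url = 'file:///tmp/application_classification/train.parquet'  # placeholder
# materialize_dataset stores the Unischema in the dataset metadata on exit.
with materialize_dataset(spark, output_url, schema, row_group_size_mb=256):
    df.write.mode('overwrite').parquet(output_url)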
Example #10
def test_create_schema_view_using_regex(self):
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int_field', np.int8,
                       (), ScalarCodec(IntegerType()), False),
        UnischemaField('string_field', np.string_,
                       (), ScalarCodec(StringType()), False),
    ])
    view = TestSchema.create_schema_view(['int.*$'])
    self.assertEqual(set(view.fields.keys()), {'int_field'})
Example #11
def test_partial_application(self):
    unischema = Unischema('foo', [])
    func = partial(dict_to_spark_row, unischema)
    func({})

    # Must pass as positional arg in the right order
    func = partial(dict_to_spark_row, {})
    with self.assertRaises(AssertionError):
        func(Unischema)
Example #12
def test_as_spark_schema_unspecified_codec_type_unknown_scalar_type_raises():
    """We have a limited list of scalar types we can automatically map from numpy (+Decimal) types to spark types.
    Make sure that a ValueError is raised if an unknown type is used."""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int_vector_unspecified_codec', object, ()),
    ])

    with pytest.raises(ValueError, match='Was not able to map type'):
        TestSchema.as_spark_schema()
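
Conversely, scalar fields whose numpy dtype is in petastorm's known mapping need no codec at all; a minimal sketch where the spark type is inferred automatically:

import numpy as np
from petastorm.unischema import Unischema, UnischemaField

AutoSchema = Unischema('AutoSchema', [
    UnischemaField('int_scalar', np.int32, ()),  # np.int32 maps to a spark type automatically
])
print(AutoSchema.as_spark_schema())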
Example #13
def test_create_schema_view_fails_validate(self):
    """ Exercises code paths unischema.create_schema_view ValueError, and unischema.__str__."""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int_field', np.int8, (), ScalarCodec(IntegerType()), False),
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
    ])
    with self.assertRaises(ValueError) as ex:
        TestSchema.create_schema_view([UnischemaField('id', np.int64, (), ScalarCodec(LongType()), False)])
    self.assertTrue('does not belong to the schema' in str(ex.exception))
Example #14
def test_create_schema_view_using_invalid_type(self):
    """ Exercises code paths unischema.create_schema_view ValueError, and unischema.__str__."""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int_field', np.int8, (), ScalarCodec(IntegerType()), False),
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
    ])
    with self.assertRaises(ValueError) as ex:
        TestSchema.create_schema_view([42])
    self.assertTrue('must be either a string' in str(ex.exception))
Example #15
def test_create_schema_view_no_field_matches_regex():
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int_field', np.int8,
                       (), ScalarCodec(IntegerType()), False),
        UnischemaField('string_field', np.string_,
                       (), ScalarCodec(StringType()), False),
    ])
    view = TestSchema.create_schema_view(['bogus'])
    assert not view.fields
Example #16
def test_create_schema_view_using_unischema_fields():
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int_field', np.int8,
                       (), ScalarCodec(IntegerType()), False),
        UnischemaField('string_field', np.string_,
                       (), ScalarCodec(StringType()), False),
    ])
    view = TestSchema.create_schema_view([TestSchema.int_field])
    assert set(view.fields.keys()) == {'int_field'}
Example #17
def test_decode_numpy_scalar_when_codec_is_none():
    """Decoding a row that has a field with the codec set to None. The type should be deduced automatically
    from UnischemaField's numpy_dtype attribute"""

    MatrixSchema = Unischema('TestSchema',
                             [UnischemaField('scalar', np.float64, ())])
    row = {'scalar': 42.0}
    decoded_value = decode_row(row, MatrixSchema)['scalar']
    assert decoded_value == 42
    assert isinstance(decoded_value, np.float64)
Example #18
def test_fields():
    """Try using 'fields' getter"""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int_field', np.int8, (), ScalarCodec(IntegerType()), False),
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
    ])

    assert len(TestSchema.fields) == 2
    assert TestSchema.fields['int_field'].name == 'int_field'
    assert TestSchema.fields['string_field'].name == 'string_field'
Example #19
def test_fields(self):
    """Try using 'fields' getter"""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int_field', np.int8, (), ScalarCodec(IntegerType()), False),
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
    ])

    self.assertEqual(len(TestSchema.fields), 2)
    self.assertEqual(TestSchema.fields['int_field'].name, 'int_field')
    self.assertEqual(TestSchema.fields['string_field'].name, 'string_field')
Example #20
def test_match_unischema_fields():
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int32', np.int32, (), None, False),
        UnischemaField('uint8', np.uint8, (), None, False),
        UnischemaField('uint16', np.uint16, (), None, False),
    ])

    assert match_unischema_fields(TestSchema, ['.*nt.*6']) == [TestSchema.uint16]
    assert match_unischema_fields(TestSchema, ['nomatch']) == []
    assert set(match_unischema_fields(TestSchema, ['.*'])) == set(TestSchema.fields.values())
    assert set(match_unischema_fields(TestSchema, ['int32', 'uint8'])) == {TestSchema.int32, TestSchema.uint8}
Example #21
def main(train: str, test: str, target_train: str, target_test: str):
    # initialise logger
    logger = logging.getLogger(__file__)
    logger.addHandler(logging.StreamHandler())
    logger.setLevel('INFO')

    logger.info('Initialising local spark')
    spark = init_local_spark()

    logger.info('Preparing schema')
    # petastorm schema
    schema = Unischema('data_schema', [
        UnischemaField('time_window', np.str_,
                       (), ScalarCodec(StringType()), False),
        UnischemaField('src_ip', np.str_, (), ScalarCodec(StringType()), False),
        UnischemaField('feature', np.float32,
                       (1, 69), CompressedNdarrayCodec(), False),
        UnischemaField('label', np.str_, (), ScalarCodec(StringType()), True),
    ])

    # processing train
    logger.info('Processing train parquet files')
    logger.info('Read parquet')
    train_feature_df = spark.read.parquet(train)

    logger.info('Composing features...')
    train_input = FeatureComposer(spark, train_feature_df).transform(
        remove_malicious=True, remove_null_label=True)

    logger.info('Changing schema...')
    train_input = change_df_schema(spark, schema, train_input)

    logger.info('Persisting...')
    save_parquet_for_petastorm_parquet(spark, train_input, target_train,
                                       schema)

    logger.info('Train input done')

    # processing test
    logger.info('Processing test parquet files')
    logger.info('Read parquet')
    test_feature_df = spark.read.parquet(test)

    logger.info('Composing features...')
    test_input = FeatureComposer(spark, test_feature_df).transform(
        remove_malicious=False, remove_null_label=True)

    logger.info('Changing schema...')
    test_input = change_df_schema(spark, schema, test_input)

    logger.info('Persisting...')
    save_parquet_for_petastorm_parquet(spark, test_input, target_test, schema)

    logger.info('Test input done')
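
Reading the persisted dataset back is symmetric; a minimal sketch using make_reader (the URL is a placeholder for target_train):

from petastorm import make_reader

with make_reader('file:///tmp/target_train') as reader:
    for row in reader:
        # feature decodes back to a (1, 69) float32 ndarray per the schema above
        print(row.src_ip, row.feature.shape)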
Example #22
def test_filter_schema_fields_from_url(self):
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int32', np.int32, (), None, False),
        UnischemaField('uint8', np.uint8, (), None, False),
        UnischemaField('uint16', np.uint16, (), None, False),
    ])

    assert match_unischema_fields(TestSchema, ['.*nt.*6']) == [TestSchema.uint16]
    assert match_unischema_fields(TestSchema, ['nomatch']) == []
    assert match_unischema_fields(TestSchema, ['.*']) == list(TestSchema.fields.values())
    assert match_unischema_fields(TestSchema, ['int32', 'uint8']) == [TestSchema.int32, TestSchema.uint8]
Example #23
def test_decode_numpy_scalar_with_explicit_scalar_codec():
    """Decoding a row that has a field with the codec set explicitly"""

    MatrixSchema = Unischema('TestSchema', [
        UnischemaField('scalar', np.float64,
                       (), ScalarCodec(DoubleType()), False)
    ])
    row = {'scalar': 42.0}
    decoded_value = decode_row(row, MatrixSchema)['scalar']
    assert decoded_value == 42
    assert isinstance(decoded_value, np.float64)
Example #24
def test_dict_to_spark_row_order():
    TestSchema = Unischema('TestSchema', [
        UnischemaField('float_col', np.float64, ()),
        UnischemaField('int_col', np.int64, ()),
    ])
    row_dict = {
        TestSchema.int_col.name: 3,
        TestSchema.float_col.name: 2.0,
    }
    spark_row = dict_to_spark_row(TestSchema, row_dict)
    schema_field_names = list(TestSchema.fields)
    assert spark_row[0] == row_dict[schema_field_names[0]]
    assert spark_row[1] == row_dict[schema_field_names[1]]
Example #25
def test_as_spark_schema(self):
    """Try using 'as_spark_schema' function"""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int_field', np.int8, (), ScalarCodec(IntegerType()), False),
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
    ])

    spark_schema = TestSchema.as_spark_schema()
    self.assertEqual(spark_schema.fields[0].name, 'int_field')
    self.assertEqual(spark_schema.fields[1].name, 'string_field')

    self.assertEqual(TestSchema.fields['int_field'].name, 'int_field')
    self.assertEqual(TestSchema.fields['string_field'].name, 'string_field')
Example #26
def test_invalid_schema_field(synthetic_dataset):
    # Let's assume we are selecting columns using a schema which is different from the one
    # stored in the dataset. Would expect to get a reasonable error message
    BogusSchema = Unischema('BogusSchema', [
        UnischemaField('partition_key', np.string_, (), ScalarCodec(StringType()), False),
        UnischemaField('id', np.int64, (), ScalarCodec(LongType()), False),
        UnischemaField('bogus_key', np.int32, (), ScalarCodec(ShortType()), False)])

    expected_values = {'bogus_key': 11, 'id': 1}
    with pytest.raises(ValueError) as e:
        Reader(synthetic_dataset.url, schema_fields=BogusSchema.fields.values(), shuffle_options=ShuffleOptions(False),
               predicate=EqualPredicate(expected_values), reader_pool=ThreadPool(1))

    assert 'bogus_key' in str(e)
Example #27
def test_make_named_tuple():
    TestSchema = Unischema('TestSchema', [
        UnischemaField('string_scalar', np.string_, (), ScalarCodec(StringType()), True),
        UnischemaField('int32_scalar', np.int32, (), ScalarCodec(ShortType()), False),
        UnischemaField('uint8_scalar', np.uint8, (), ScalarCodec(ShortType()), False),
        UnischemaField('int32_matrix', np.float32, (10, 20, 3), NdarrayCodec(), True),
        UnischemaField('decimal_scalar', Decimal, (10, 20, 3), ScalarCodec(DecimalType(10, 9)), False),
    ])

    TestSchema.make_namedtuple(string_scalar='abc', int32_scalar=10, uint8_scalar=20,
                               int32_matrix=np.int32((10, 20, 3)), decimal_scalar=Decimal(123) / Decimal(10))

    TestSchema.make_namedtuple(string_scalar=None, int32_scalar=10, uint8_scalar=20,
                               int32_matrix=None, decimal_scalar=Decimal(123) / Decimal(10))
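
make_namedtuple returns an instance of the namedtuple type generated for the schema, so fields come back as attributes; a short sketch reusing TestSchema and Decimal from this example:

row = TestSchema.make_namedtuple(string_scalar='abc', int32_scalar=10, uint8_scalar=20,
                                 int32_matrix=None, decimal_scalar=Decimal(123) / Decimal(10))
assert row.int32_scalar == 10
assert row.string_scalar == 'abc'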
Example #28
def test_schema_to_dtype_list():
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int32', np.int32, (), None, False),
        UnischemaField('uint8', np.uint8, (), None, False),
        UnischemaField('uint16', np.uint16, (), None, False),
        UnischemaField('Decimal', Decimal, (), None, False),
    ])

    actual_tf_dtype_list = _schema_to_tf_dtypes(TestSchema)
    # Note that the order of the fields is defined by alphabetical order of keys and always sorted by Unischema
    # to avoid ambiguity
    #  [Decimal,   int32,    uint16,   uint8] <- alphabetical order
    #  [tf.string, tf.int32, tf.int32, tf.uint8]
    np.testing.assert_equal(actual_tf_dtype_list, [tf.string, tf.int32, tf.int32, tf.uint8])
Example #29
def test_dict_to_spark_row_field_validation_ndarrays():
    """Test various validations done on data types when converting a dictionary to a spark row"""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('tensor3d', np.float32, (10, 20, 30), NdarrayCodec(), False),
    ])

    assert isinstance(dict_to_spark_row(TestSchema, {'tensor3d': np.zeros((10, 20, 30), dtype=np.float32)}), Row)

    # Null value into a non-nullable field
    with pytest.raises(ValueError):
        isinstance(dict_to_spark_row(TestSchema, {'tensor3d': None}), Row)

    # Wrong dimensions
    with pytest.raises(ValueError):
        isinstance(dict_to_spark_row(TestSchema, {'tensor3d': np.zeros((1, 2, 3), dtype=np.float32)}), Row)
Example #30
def test_dict_to_spark_row_field_validation_scalar_types():
    """Test various validations done on data types when converting a dictionary to a spark row"""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
    ])

    assert isinstance(dict_to_spark_row(TestSchema, {'string_field': 'abc'}), Row)

    # Not a nullable field
    with pytest.raises(ValueError):
        isinstance(dict_to_spark_row(TestSchema, {'string_field': None}), Row)

    # Wrong field type
    with pytest.raises(TypeError):
        isinstance(dict_to_spark_row(TestSchema, {'string_field': []}), Row)
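
In a full write pipeline these per-field validations run inside a spark map, one dict per record; a self-contained sketch of that pattern (dataset contents are illustrative):

import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType
from petastorm.codecs import ScalarCodec
from petastorm.unischema import Unischema, UnischemaField, dict_to_spark_row

TestSchema = Unischema('TestSchema', [
    UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
])

spark = SparkSession.builder.master('local[1]').getOrCreate()
# Each dict is validated and encoded into a pyspark Row by dict_to_spark_row.
rows_rdd = (spark.sparkContext
            .parallelize(range(10))
            .map(lambda i: {'string_field': 'row_{}'.format(i)})
            .map(lambda d: dict_to_spark_row(TestSchema, d)))
df = spark.createDataFrame(rows_rdd, TestSchema.as_spark_schema())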