import os
import sys
from pathlib import Path

import numpy as np
import psutil
from petastorm.codecs import CompressedNdarrayCodec, ScalarCodec
from petastorm.unischema import Unischema, UnischemaField
from pyspark.sql import SparkSession
from pyspark.sql.types import LongType

# create_train_test_for_task and print_df_label_distribution are
# project-local helpers (not shown in this excerpt).


def main(source, target, test_size, under_sampling):
    source_data_dir_path = Path(source)
    target_data_dir_path = Path(target)

    # prepare dir for dataset
    application_data_dir_path = target_data_dir_path / 'application_classification'
    traffic_data_dir_path = target_data_dir_path / 'traffic_classification'

    # initialise local spark
    os.environ['PYSPARK_PYTHON'] = sys.executable
    os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
    memory_gb = psutil.virtual_memory().available // 1024 // 1024 // 1024
    spark = (SparkSession.builder.master('local[*]')
             .config('spark.driver.memory', f'{memory_gb}g')
             .config('spark.driver.host', '127.0.0.1')
             .getOrCreate())

    # prepare final schema
    schema = Unischema('data_schema', [
        UnischemaField('feature', np.float32, (1, 1500),
                       CompressedNdarrayCodec(), False),
        UnischemaField('flow_feature', np.float32, (1, 76),
                       CompressedNdarrayCodec(), False),
        UnischemaField('label', np.int32, (), ScalarCodec(LongType()), False),
    ])

    # read data
    df = spark.read.parquet(
        f'{source_data_dir_path.absolute().as_uri()}/*.parquet')

    # prepare data for application classification and traffic classification
    print('processing application classification dataset')
    create_train_test_for_task(df=df, label_col='app_label', spark=spark,
                               schema=schema, test_size=test_size,
                               under_sampling=under_sampling,
                               data_dir_path=application_data_dir_path)

    print('processing traffic classification dataset')
    create_train_test_for_task(df=df, label_col='traffic_label', spark=spark,
                               schema=schema, test_size=test_size,
                               under_sampling=under_sampling,
                               data_dir_path=traffic_data_dir_path)

    # stats
    print_df_label_distribution(spark, schema,
                                application_data_dir_path / 'train.parquet')
    print_df_label_distribution(spark, schema,
                                application_data_dir_path / 'test.parquet')
    print_df_label_distribution(spark, schema,
                                traffic_data_dir_path / 'train.parquet')
    print_df_label_distribution(spark, schema,
                                traffic_data_dir_path / 'test.parquet')
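For reference, the materialized splits can be read back with petastorm's generic reader; a minimal sketch, assuming the train.parquet written above and the field names from the Unischema in main():

from petastorm import make_reader

train_url = (application_data_dir_path / 'train.parquet').absolute().as_uri()
with make_reader(train_url) as reader:
    for row in reader:
        # rows come back as namedtuples; the codecs have already decoded
        # .feature / .flow_feature into numpy arrays and .label into a scalar
        print(row.feature.shape, row.label)
        break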
import numpy as np
from petastorm.codecs import CompressedNdarrayCodec
from petastorm.unischema import UnischemaField


def test_compressed_ndarray_codec():
    SHAPE = (10, 20, 30)
    expected = np.random.rand(*SHAPE).astype(dtype=np.int32)
    codec = CompressedNdarrayCodec()
    field = UnischemaField(name='test_name', numpy_dtype=np.int32,
                           shape=SHAPE, codec=CompressedNdarrayCodec(),
                           nullable=False)
    np.testing.assert_equal(codec.decode(field, codec.encode(field, expected)),
                            expected)
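The same round trip works outside a test harness; a small sketch (field name and shape are illustrative) showing that encode() yields a compressed byte string and decode() restores the array losslessly:

import numpy as np
from petastorm.codecs import CompressedNdarrayCodec
from petastorm.unischema import UnischemaField

field = UnischemaField('arr', np.float32, (64, 64),
                       CompressedNdarrayCodec(), False)
codec = CompressedNdarrayCodec()

arr = np.zeros((64, 64), dtype=np.float32)  # highly redundant, compresses well
encoded = codec.encode(field, arr)          # compressed, serialized ndarray
decoded = codec.decode(field, encoded)

assert np.array_equal(decoded, arr)
print(len(encoded), arr.nbytes)             # encoded size vs. raw 16 KiB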
import logging

import numpy as np
from petastorm.codecs import CompressedNdarrayCodec, ScalarCodec
from petastorm.unischema import Unischema, UnischemaField
from pyspark.sql.types import StringType

# init_local_spark, FeatureComposer, change_df_schema and
# save_parquet_for_petastorm_parquet are project-local helpers.


def main(train: str, test: str, target_train: str, target_test: str):
    # initialise logger
    logger = logging.getLogger(__file__)
    logger.addHandler(logging.StreamHandler())
    logger.setLevel('INFO')

    logger.info('Initialising local spark')
    spark = init_local_spark()

    logger.info('Preparing schema')
    # petastorm schema (np.str_ replaces the original np.str alias,
    # which was removed in NumPy 1.24)
    schema = Unischema('data_schema', [
        UnischemaField('time_window', np.str_, (), ScalarCodec(StringType()), False),
        UnischemaField('src_ip', np.str_, (), ScalarCodec(StringType()), False),
        UnischemaField('feature', np.float32, (1, 69), CompressedNdarrayCodec(), False),
        UnischemaField('label', np.str_, (), ScalarCodec(StringType()), True),
    ])

    # processing train
    logger.info('Processing train parquet files')
    logger.info('Read parquet')
    train_feature_df = spark.read.parquet(train)

    logger.info('Composing features...')
    train_input = FeatureComposer(spark, train_feature_df).transform(
        remove_malicious=True, remove_null_label=True)

    logger.info('Changing schema...')
    train_input = change_df_schema(spark, schema, train_input)

    logger.info('Persisting...')
    save_parquet_for_petastorm_parquet(spark, train_input, target_train, schema)

    logger.info('Train input done')

    # processing test
    logger.info('Processing test parquet files')
    logger.info('Read parquet')
    test_feature_df = spark.read.parquet(test)

    logger.info('Composing features...')
    test_input = FeatureComposer(spark, test_feature_df).transform(
        remove_malicious=False, remove_null_label=True)

    logger.info('Changing schema...')
    test_input = change_df_schema(spark, schema, test_input)

    logger.info('Persisting...')
    save_parquet_for_petastorm_parquet(spark, test_input, target_test, schema)

    logger.info('Test input done')
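save_parquet_for_petastorm_parquet is not shown in this excerpt; a plausible minimal sketch, assuming it simply wraps petastorm's materialize_dataset so the Unischema metadata is stored next to the parquet files for later decoding:

from petastorm.etl.dataset_metadata import materialize_dataset

def save_parquet_for_petastorm_parquet(spark, df, output_url, schema):
    # materialize_dataset records the Unischema in the dataset metadata,
    # which is what lets petastorm readers decode the rows afterwards
    with materialize_dataset(spark, output_url, schema):
        df.write.mode('overwrite').parquet(output_url)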
import os

import numpy as np
from petastorm.codecs import CompressedNdarrayCodec, ScalarCodec
from petastorm.etl.dataset_metadata import materialize_dataset
from petastorm.unischema import Unischema, UnischemaField, dict_to_spark_row
from pyspark.sql.types import IntegerType

# FrameInfo, ConfigurationManager, Session and row_generator come from the
# surrounding project; this is a method excerpted from its class.


def __init__(self, dataset_name: str, frame_metadata: FrameInfo):
    self.dataset_name = dataset_name
    self.H = frame_metadata.height
    self.W = frame_metadata.width
    self.C = frame_metadata.num_channels

    # The schema defines the layout of each dataset row
    self.dataset_schema = Unischema(self.dataset_name, [
        UnischemaField('frame_id', np.int32, (),
                       ScalarCodec(IntegerType()), False),
        UnischemaField('frame_data', np.uint8, (self.H, self.W, self.C),
                       CompressedNdarrayCodec(), False),
    ])

    # Construct output location
    eva_dir = ConfigurationManager().get_value("core", "location")
    output_url = os.path.join(eva_dir, self.dataset_name)

    # Get session handle
    session = Session()
    spark = session.get_session()
    spark_context = session.get_context()

    # Wrap dataset materialization portion.
    rows_count = 10
    with materialize_dataset(spark, output_url, self.dataset_schema):
        rows_rdd = spark_context.parallelize(range(rows_count)) \
            .map(lambda x: row_generator(x, self.H, self.W, self.C)) \
            .map(lambda x: dict_to_spark_row(self.dataset_schema, x))

        spark.createDataFrame(rows_rdd, self.dataset_schema.as_spark_schema()) \
            .coalesce(10) \
            .write \
            .mode('overwrite') \
            .parquet(output_url)
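row_generator is referenced above but not defined in this excerpt; a minimal sketch consistent with the schema (random pixels stand in for real decoded frames):

import numpy as np

def row_generator(frame_id, height, width, channels):
    # One dict per row, keyed by the Unischema field names; dict_to_spark_row
    # applies the codecs when converting this into a Spark Row
    return {
        'frame_id': frame_id,
        'frame_data': np.random.randint(0, 255,
                                        size=(height, width, channels),
                                        dtype=np.uint8),
    }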
import os
import sys
from pathlib import Path

import numpy as np
import psutil
from petastorm.codecs import CompressedNdarrayCodec, ScalarCodec
from petastorm.unischema import Unischema, UnischemaField
from pyspark.sql import SparkSession
from pyspark.sql.types import LongType

source = '/home/munhou/DeepPacket/processed_data'
target = '/home/munhou/DeepPacket/www'

source_data_dir_path = Path(source)
target_data_dir_path = Path(target)

# prepare dir for dataset
application_data_dir_path = target_data_dir_path / 'application_classification'
traffic_data_dir_path = target_data_dir_path / 'traffic_classification'

# initialise local spark
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
memory_gb = psutil.virtual_memory().available // 1024 // 1024 // 1024
spark = (SparkSession.builder.master('local[*]')
         .config('spark.driver.memory', f'{memory_gb}g')
         .config('spark.driver.host', '127.0.0.1')
         .getOrCreate())

# prepare final schema
schema = Unischema('data_schema', [
    UnischemaField('feature', np.float32, (1, 1500),
                   CompressedNdarrayCodec(), False),
    UnischemaField('label', np.int32, (), ScalarCodec(LongType()), False),
])

# %%
# read data, dropping the bulky packet-bytes column
df = spark.read.parquet(
    f'{source_data_dir_path.absolute().as_uri()}/*.parquet').drop('feature')
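With the feature column dropped, the remaining columns are cheap to aggregate; a quick sketch of a class-balance check (the label column names follow the two task definitions used earlier):

df.groupBy('app_label').count().orderBy('count', ascending=False).show()
df.groupBy('traffic_label').count().orderBy('count', ascending=False).show()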
from petastorm.codecs import CompressedNdarrayCodec, NdarrayCodec


def test_str_special_method():
    codec = NdarrayCodec()
    assert str(codec) == 'NdarrayCodec()'

    codec = CompressedNdarrayCodec()
    assert str(codec) == 'CompressedNdarrayCodec()'