Example #1
    def test_numpy_codec(self):
        SHAPE = (10, 20, 30)
        expected = np.random.rand(*SHAPE).astype(dtype=np.int32)
        codec = CompressedNdarrayCodec()
        field = UnischemaField(name='test_name', numpy_dtype=np.int32, shape=SHAPE, codec=CompressedNdarrayCodec(),
                               nullable=False)
        np.testing.assert_equal(codec.decode(field, codec.encode(field, expected)), expected)
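A quick way to see what CompressedNdarrayCodec buys over the plain NdarrayCodec is to encode the same array with both and compare payload sizes. A minimal sketch, assuming only the petastorm codecs used in these examples; the field name and the zero-filled array are purely illustrative.

import numpy as np
from petastorm.codecs import CompressedNdarrayCodec, NdarrayCodec
from petastorm.unischema import UnischemaField

field = UnischemaField('demo', np.int32, (10, 20, 30), CompressedNdarrayCodec(), False)
data = np.zeros((10, 20, 30), dtype=np.int32)  # highly compressible payload
raw = NdarrayCodec().encode(field, data)                # np.save-style buffer
packed = CompressedNdarrayCodec().encode(field, data)   # compressed buffer
print(len(raw), len(packed))  # the compressed buffer should be much smaller here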
Example #2
def main(source, target, test_size, under_sampling):
    source_data_dir_path = Path(source)
    target_data_dir_path = Path(target)

    # prepare output dirs for the datasets
    application_data_dir_path = target_data_dir_path / 'application_classification'
    traffic_data_dir_path = target_data_dir_path / 'traffic_classification'

    # initialise local spark
    os.environ['PYSPARK_PYTHON'] = sys.executable
    os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
    memory_gb = psutil.virtual_memory().available // 1024 // 1024 // 1024
    spark = (SparkSession.builder
             .master('local[*]')
             .config('spark.driver.memory', f'{memory_gb}g')
             .config('spark.driver.host', '127.0.0.1')
             .getOrCreate())

    # prepare final schema
    schema = Unischema('data_schema', [
        UnischemaField('feature', np.float32,
                       (1, 1500), CompressedNdarrayCodec(), False),
        UnischemaField('flow_feature', np.float32,
                       (1, 76), CompressedNdarrayCodec(), False),
        UnischemaField('label', np.int32, (), ScalarCodec(LongType()), False),
    ])

    # read data
    df = spark.read.parquet(
        f'{source_data_dir_path.absolute().as_uri()}/*.parquet')

    # prepare data for application classification and traffic classification
    print('processing application classification dataset')
    create_train_test_for_task(df=df,
                               label_col='app_label',
                               spark=spark,
                               schema=schema,
                               test_size=test_size,
                               under_sampling=under_sampling,
                               data_dir_path=application_data_dir_path)

    print('processing traffic classification dataset')
    create_train_test_for_task(df=df,
                               label_col='traffic_label',
                               spark=spark,
                               schema=schema,
                               test_size=test_size,
                               under_sampling=under_sampling,
                               data_dir_path=traffic_data_dir_path)

    # stats
    print_df_label_distribution(spark, schema,
                                application_data_dir_path / 'train.parquet')
    print_df_label_distribution(spark, schema,
                                application_data_dir_path / 'test.parquet')
    print_df_label_distribution(spark, schema,
                                traffic_data_dir_path / 'train.parquet')
    print_df_label_distribution(spark, schema,
                                traffic_data_dir_path / 'test.parquet')
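Once create_train_test_for_task has materialized train.parquet and test.parquet with the schema above, the sets can be read back with petastorm's make_reader. A minimal sketch; the file:// URL below is only a placeholder for wherever target points.

from petastorm import make_reader

# placeholder URL; point it at the train.parquet written by the step above
with make_reader('file:///tmp/application_classification/train.parquet') as reader:
    for row in reader:
        print(row.label, row.feature.shape)  # field names come from the Unischema
        break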
Example #3
def main(train: str, test: str, target_train: str, target_test: str):
    # initialise logger
    logger = logging.getLogger(__file__)
    logger.addHandler(logging.StreamHandler())
    logger.setLevel('INFO')

    logger.info('Initialising local spark')
    spark = init_local_spark()

    logger.info('Preparing schema')
    # petastorm schema
    schema = Unischema('data_schema', [
        UnischemaField('time_window', np.str_,
                       (), ScalarCodec(StringType()), False),
        UnischemaField('src_ip', np.str_, (), ScalarCodec(StringType()), False),
        UnischemaField('feature', np.float32,
                       (1, 69), CompressedNdarrayCodec(), False),
        UnischemaField('label', np.str_, (), ScalarCodec(StringType()), True),
    ])

    # processing train
    logger.info('Processing train parquet files')
    logger.info('Read parquet')
    train_feature_df = spark.read.parquet(train)

    logger.info('Composing features...')
    train_input = FeatureComposer(spark, train_feature_df).transform(
        remove_malicious=True, remove_null_label=True)

    logger.info('Changing schema...')
    train_input = change_df_schema(spark, schema, train_input)

    logger.info('Persisting...')
    save_parquet_for_petastorm_parquet(spark, train_input, target_train,
                                       schema)

    logger.info('Train input done')

    # processing test
    logger.info('Processing test parquet files')
    logger.info('Read parquet')
    test_feature_df = spark.read.parquet(test)

    logger.info('Composing features...')
    test_input = FeatureComposer(spark, test_feature_df).transform(
        remove_malicious=False, remove_null_label=True)

    logger.info('Changing schema...')
    test_input = change_df_schema(spark, schema, test_input)

    logger.info('Persisting...')
    save_parquet_for_petastorm_parquet(spark, test_input, target_test, schema)

    logger.info('Test input done')
Example #4
    def __init__(self, dataset_name: str, frame_metadata: FrameInfo):

        self.dataset_name = dataset_name
        self.H = frame_metadata.height
        self.W = frame_metadata.width
        self.C = frame_metadata.num_channels

        # The Unischema defines the layout of the dataset
        self.dataset_schema = Unischema(self.dataset_name, [
            UnischemaField('frame_id', np.int32,
                           (), ScalarCodec(IntegerType()), False),
            UnischemaField('frame_data', np.uint8, (self.H, self.W, self.C),
                           CompressedNdarrayCodec(), False),
        ])

        # Construct output location
        eva_dir = ConfigurationManager().get_value("core", "location")
        output_url = os.path.join(eva_dir, self.dataset_name)

        # Get session handle
        session = Session()
        spark = session.get_session()
        spark_context = session.get_context()

        # Wrap dataset materialization portion.
        rows_count = 10
        with materialize_dataset(spark, output_url, self.dataset_schema):

            rows_rdd = spark_context.parallelize(range(rows_count))\
                .map(lambda x: row_generator(x, self.H, self.W, self.C))\
                .map(lambda x: dict_to_spark_row(self.dataset_schema, x))

            spark.createDataFrame(rows_rdd,
                                  self.dataset_schema.as_spark_schema()) \
                .coalesce(10) \
                .write \
                .mode('overwrite') \
                .parquet(output_url)
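The snippet above relies on a row_generator helper that is not shown here. A minimal sketch compatible with dataset_schema, assuming random frame data purely for illustration:

import numpy as np

def row_generator(x, height, width, num_channels):
    # One dict per row, keyed by the UnischemaField names defined above
    return {
        'frame_id': x,
        'frame_data': np.random.randint(0, 255, size=(height, width, num_channels),
                                        dtype=np.uint8),
    }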
Example #5
import os
import sys
from pathlib import Path

import numpy as np
import psutil
from pyspark.sql import SparkSession
from pyspark.sql.types import LongType

from petastorm.codecs import CompressedNdarrayCodec, ScalarCodec
from petastorm.unischema import Unischema, UnischemaField

source = '/home/munhou/DeepPacket/processed_data'
target = '/home/munhou/DeepPacket/www'
source_data_dir_path = Path(source)
target_data_dir_path = Path(target)

# prepare output dirs for the datasets
application_data_dir_path = target_data_dir_path / 'application_classification'
traffic_data_dir_path = target_data_dir_path / 'traffic_classification'

# initialise local spark
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
memory_gb = psutil.virtual_memory().available // 1024 // 1024 // 1024
spark = (SparkSession.builder
         .master('local[*]')
         .config('spark.driver.memory', f'{memory_gb}g')
         .config('spark.driver.host', '127.0.0.1')
         .getOrCreate())

# prepare final schema
schema = Unischema('data_schema', [
    UnischemaField('feature', np.float32,
                   (1, 1500), CompressedNdarrayCodec(), False),
    UnischemaField('label', np.int32, (), ScalarCodec(LongType()), False),
])
# %%
# read data
df = spark.read.parquet(
    f'{source_data_dir_path.absolute().as_uri()}/*.parquet').drop('feature')
Example #6
def test_str_special_method():
    codec = NdarrayCodec()
    assert str(codec) == 'NdarrayCodec()'

    codec = CompressedNdarrayCodec()
    assert str(codec) == 'CompressedNdarrayCodec()'