Python Dataset示例，data_generator.model.dataset.Dataset Python示例

示例#1

0

显示文件

def should_generate_pages_map():
    """Check the linked pages generated for a category, an article and a static page.

    For each sampled page, the generated map entry must contain the expected
    pages as one sequence, in the order listed here.
    """
    categories = ['category {}'.format(number) for number in range(1, 26)]
    static_pages = ['index', 'about', 'my-account']
    category_17_articles = ['article category 17-{}'.format(number) for number in range(1, 26)]

    expected_links_by_page = {
        # a category page links to the categories, the static pages and its own articles
        'category 17': categories + static_pages + category_17_articles,
        # an article page links to the categories and the static pages
        'article category 25-17': categories + static_pages,
        # a static page lists the static pages first, then the categories
        'about': static_pages + categories,
    }

    generated_map = Dataset.create_page_map()

    for page, expected_links in expected_links_by_page.items():
        assert_that(generated_map[page]).contains_sequence(*expected_links)

示例#2

0

显示文件

def should_reinitialize_a_visit_with_random_duration():
    """Check that reinitializing a visit keeps its app version and anomaly but renews its identity.

    The visit id, user id and duration are expected to differ after the call to
    ``Dataset.reinitialize_visit``.
    """
    generated = Dataset(10, 30, percentage_incomplete_data=1, percentage_inconsistent_data=1,
                        percentage_app_v1=10, percentage_app_v2=15, users_number=100,
                        timer=Timer(-900), no_data_consent_percentage=2)
    visit = generated.visits[0]

    # Snapshot the state before the reset, since the visit object is checked again afterwards.
    app_version_before = visit.app_version
    anomaly_before = visit.data_anomaly
    attributes_before = dict(visit.__dict__)

    generated.reinitialize_visit(visit)

    # These two attributes must survive the reinitialization untouched.
    assert_that(visit.app_version).is_equal_to(app_version_before)
    assert_that(visit.data_anomaly).is_equal_to(anomaly_before)
    # Assert only on the fields that are certainly different every time.
    for renewed_field in ('visit_id', 'user_id', 'duration_seconds'):
        assert_that(getattr(visit, renewed_field)).is_not_equal_to(attributes_before[renewed_field])

示例#3

0

显示文件

def should_create_keep_private_flags_distribution():
    """Check the True/False split returned by ``create_keep_private_flags_distribution(50, 25)``.

    The expected outcome is 13 ``True`` flags and 37 ``False`` flags.
    """
    flags = Dataset.create_keep_private_flags_distribution(50, 25)

    # Only True and False are valid keys: any other flag value fails with a KeyError.
    occurrences = dict.fromkeys((True, False), 0)
    for keep_private in flags:
        occurrences[keep_private] = occurrences[keep_private] + 1

    private_total, public_total = occurrences[True], occurrences[False]
    assert_that(private_total).is_equal_to(13)
    assert_that(public_total).is_equal_to(37)

示例#4

0

显示文件

def should_create_correct_apps_distribution():
    """Check the per-version counts returned by ``create_versions_distribution(50, 10, 10, 80)``.

    The expected outcome is 5 entries for v1, 5 for v2 and 40 for v3.
    """
    versions = Dataset.create_versions_distribution(50, 10, 10, 80)

    # Only the three known versions are valid keys: any other value fails with a KeyError.
    occurrences = dict.fromkeys(('v1', 'v2', 'v3'), 0)
    for app_version in versions:
        occurrences[app_version] = occurrences[app_version] + 1

    for app_version, expected_total in (('v1', 5), ('v2', 5), ('v3', 40)):
        assert_that(occurrences[app_version]).is_equal_to(expected_total)

示例#5

0

显示文件

def should_create_correct_data_anomalies_distribution():
    """Check the per-anomaly counts returned by ``create_data_anomalies_distribution(60, 10, 12)``.

    The expected outcome is 47 MISSING, 6 INCOMPLETE_DATA and 8 INCONSISTENT_DATA entries.
    """
    anomalies = Dataset.create_data_anomalies_distribution(60, 10, 12)

    # Only the three known anomaly kinds are valid keys: any other value fails with a KeyError.
    occurrences = dict.fromkeys(
        (DataAnomaly.MISSING, DataAnomaly.INCOMPLETE_DATA, DataAnomaly.INCONSISTENT_DATA), 0)
    for data_anomaly in anomalies:
        occurrences[data_anomaly] = occurrences[data_anomaly] + 1

    # The counts add up to 61 rather than 60. This is a rounding issue that is not worth solving
    # right now because its impact on the system is too small.
    expected_totals = (
        (DataAnomaly.MISSING, 47),
        (DataAnomaly.INCOMPLETE_DATA, 6),
        (DataAnomaly.INCONSISTENT_DATA, 8),
    )
    for data_anomaly, expected_total in expected_totals:
        assert_that(occurrences[data_anomaly]).is_equal_to(expected_total)

示例#6

0

显示文件

def should_create_dataset_from_yaml_configuration():
    """Check that ``Dataset.from_yaml`` applies the values of ``dataset_configuration.yaml``.

    The configuration file is read from the directory that contains this test module.
    """
    configuration_path = pathlib.Path(__file__).parent.absolute() / 'dataset_configuration.yaml'
    with open(configuration_path) as configuration_file:
        parsed_configuration = yaml.load(configuration_file, Loader=yaml.FullLoader)

    generated = Dataset.from_yaml(parsed_configuration)

    version_totals = Counter(generated.versions_to_distribute)
    for app_version, expected_total in (('v1', 200), ('v2', 200), ('v3', 600)):
        assert_that(version_totals[app_version]).is_equal_to(expected_total)

    assert_that(generated.timer.latency_seconds).is_equal_to(-900)

    anomaly_totals = Counter(generated.data_anomalies_distribution)
    expected_anomaly_totals = (
        (DataAnomaly.MISSING, 960),
        (DataAnomaly.INCOMPLETE_DATA, 20),
        (DataAnomaly.INCONSISTENT_DATA, 20),
    )
    for data_anomaly, expected_total in expected_anomaly_totals:
        assert_that(anomaly_totals[data_anomaly]).is_equal_to(expected_total)

    # The duration bounds are name-mangled private attributes, hence the _Dataset prefix.
    assert_that(generated._Dataset__duration_min).is_equal_to(10)
    assert_that(generated._Dataset__duration_max).is_equal_to(300)

示例#7

0

显示文件

文件： generate_dataset_to_kafka.py 项目： rossanomarcos/data-generator

sys.path.append(os.path.abspath(os.path.join('..', 'data-generator')))

from data_generator.model.unordered_data import UnorderedDataContainer
from data_generator.model.dataset import Dataset
from data_generator.sink.kafka_writer import KafkaWriterConfiguration


# Script entry point: load configuration.yaml, build the dataset from it and prepare the Kafka topics.
if __name__ == '__main__':
    # configuration.yaml is looked up in the directory of the launched script (sys.argv[0]).
    pathname = os.path.dirname(sys.argv[0])
    # NOTE(review): os.path.join receives a single, already formatted path here, so it adds nothing.
    configuration_file_path = os.path.join('{}/configuration.yaml'.format(os.path.abspath(pathname)))
    with open(configuration_file_path) as file:
        # NOTE(review): FullLoader is not PyYAML's safe loader; acceptable only for a trusted local file.
        configuration = yaml.load(file, Loader=yaml.FullLoader)
        print('Configuration = {}'.format(configuration))

    # Both objects are built from the same parsed YAML configuration.
    dataset = Dataset.from_yaml(configuration)
    unordered_data_container = UnorderedDataContainer.from_yaml_with_random_distribution(configuration)

    def should_send_late_data_to_kafka():
        """Draw one flag from a pool of 90 zeros and 10 ones.

        NOTE(review): ``choice`` is presumably ``random.choice`` (its import is not
        visible here), which would make 1 come back about 10% of the time.
        """
        flags = [0] * 90 + [1] * 10
        return choice(flags)

    # give Kafka 30 seconds to start
    sleep(30)
    # From here on `configuration` no longer names the parsed YAML dict but the Kafka writer
    # configuration built from its 'kafka' section.
    configuration = KafkaWriterConfiguration(configuration['kafka'])
    configuration.create_or_recreate_topics()
    # The first configured topic is kept as the output topic name.
    output_topic_name = configuration.topics[0].name


    def get_random_duration_in_seconds():
        """Return ``randint(1, 10)``.

        NOTE(review): presumably ``random.randint`` (its import is not visible here),
        i.e. a whole number of seconds between 1 and 10, both inclusive.
        """
        return randint(1, 10)

示例#8

0

显示文件

def should_create_a_correct_number_of_visits():
    """Check that a dataset configured with 100 users holds exactly 100 visits."""
    expected_visits = 100
    generated = Dataset(10, 30, percentage_incomplete_data=1, percentage_inconsistent_data=1,
                        percentage_app_v1=10, percentage_app_v2=15, users_number=expected_visits,
                        timer=Timer(-900), no_data_consent_percentage=2)

    assert_that(generated.visits).is_length(expected_visits)

示例#9

0

显示文件

文件： write_sessions_to_kinesis.py 项目： rossanomarcos/data-generator

import sys
from random import randint, choice

from data_generator.sink.kinesis_writer import KinesisWriterConfiguration

sys.path.append(os.path.abspath(os.path.join('..', 'data-generator')))

from data_generator.model.unordered_data import UnorderedDataContainer
from data_generator.model.timer import Timer
from data_generator.model.dataset import Dataset

if __name__ == '__main__':
    dataset = Dataset(duration_min_seconds=10,
                      duration_max_seconds=300,
                      percentage_incomplete_data=2,
                      percentage_inconsistent_data=2,
                      percentage_app_v1=20,
                      percentage_app_v2=20,
                      users_number=3000,
                      timer=Timer(latency_seconds=-900))

    unordered_data_container = UnorderedDataContainer(
        lambda: choice([0] * 90 + [1] * 10))

    def should_send_unordered_actions():
        """Return 1 with a 10% chance and 0 otherwise (uniform draw over 90 zeros and 10 ones)."""
        return choice([0] * 90 + [1] * 10)

    output_stream_name = 'data-generator-test'
    configuration = KinesisWriterConfiguration(
        {'topics': {
            output_stream_name: {

示例#10

0

显示文件

文件： write_sessions_to_local_filesystem.py 项目： rossanomarcos/data-generator

import random

import sys

sys.path.append(os.path.abspath(os.path.join('..', 'data-generator')))
from data_generator.model.dataset import Dataset
from data_generator.model.timer import Timer
from data_generator.sink.local_filesystem_writer import LocalFileSystemConfiguration

logging.basicConfig(filename='/tmp/logs_writer.txt', level=logging.DEBUG, format='%(asctime)s %(message)s')

if __name__ == '__main__':
    timer = Timer(-3*24*60*60)  # starting from 3 days ago
    dataset = Dataset(duration_min_seconds=120, duration_max_seconds=600,
                      percentage_incomplete_data=0, percentage_inconsistent_data=0,
                      percentage_app_v1=20, percentage_app_v2=20,
                      users_number=2, timer=timer, no_data_consent_percentage=0
                      )

    def get_random_duration_in_seconds():
        """Return a random whole number of seconds between 1 and 10, both inclusive."""
        shortest, longest = 1, 10
        return random.randint(shortest, longest)

    def _extract_event_time(json_data):
        """Build the partition key of a JSON-serialized event from its ``event_time`` field.

        The field is sliced by fixed character positions (0-4, 5-7, 8-10 and 11-13) and the
        four pieces are joined with slashes.
        NOTE(review): this assumes an ISO-8601-like value such as '2020-03-15T09:30:00', which
        yields 'YYYY/MM/DD/HH' - confirm against the event producer.

        :param json_data: JSON document (string) holding an ``event_time`` key
        :return: the partition key, e.g. '2020/03/15/09'
        :raises KeyError: when the document has no ``event_time`` key
        """
        event_time = json.loads(json_data)['event_time']
        # TODO: try to format it with a datetime formatter
        # The placeholders are named after what each slice holds: the first 4 characters are
        # the year and characters 8-10 the day. The output is the same as before the renaming.
        return '{year}/{month}/{day}/{hour}'.format(year=event_time[0:4], month=event_time[5:7],
                                                    day=event_time[8:10], hour=event_time[11:13])

    configuration = LocalFileSystemConfiguration(
        partition_getter=_extract_event_time,