Пример #1
0
 def __init__(self, log_name):
     """Load the data-pipeline tools config, set up logging under
     *log_name*, and cache the schematizer client.
     """
     self.log_name = log_name
     # Side effect: installs the package config globally so the
     # get_config() call below sees it.
     load_package_config('/nail/srv/configs/data_pipeline_tools.yaml')
     self.config = get_config()
     self.log = logging.getLogger(self.log_name)
     self._setup_logging()
     self.schematizer = get_schematizer()
 def __init__(self):
     """Set up the replication stream parser: open DB connections, wrap
     the schematizer client, and read dry-run/changelog flags from the
     environment config.

     Exits the process immediately when kafka_producer_buffer_size
     exceeds recovery_queue_size.
     """
     super(BaseParseReplicationStream, self).__init__()
     self.db_connections = get_connection(
         config.env_config.topology_path,
         config.env_config.rbr_source_cluster,
         config.env_config.schema_tracker_cluster,
         config.env_config.rbr_state_cluster,
         is_avoid_internal_packages_set(),
         config.env_config.rbr_source_cluster_topology_name,
     )
     self.schema_wrapper = SchemaWrapper(
         db_connections=self.db_connections,
         schematizer_client=get_schematizer()
     )
     self.register_dry_run = config.env_config.register_dry_run
     self.publish_dry_run = config.env_config.publish_dry_run
     self._running = True
     self._profiler_running = False
     self._changelog_mode = config.env_config.changelog_mode
     if get_config().kafka_producer_buffer_size > config.env_config.recovery_queue_size:
         # Printing here, since this executes *before* logging is
         # configured.
         # Bug fix: the original string used a backslash continuation,
         # which baked a long run of indentation spaces into the message
         # and had no trailing newline.
         sys.stderr.write(
             "Shutting down because kafka_producer_buffer_size was "
             "greater than recovery_queue_size\n"
         )
         sys.exit(1)
Пример #3
0
def schematizer():
    """Return the schematizer singleton with its per-module state reset.

    Re-runs by hand the constructor steps that matter between modules:
    a fresh swagger client and empty caches.
    """
    schematizer = get_schematizer()
    # schematizer is a Singleton. Rerun the ctor of Schematizer per module.
    schematizer._client = get_config().schematizer_client  # swaggerpy client
    schematizer._cache = _Cache()
    schematizer._avro_schema_cache = {}
    return schematizer
Пример #4
0
 def __init__(self, log_name):
     """Load the data-pipeline tools config, configure logging for
     *log_name*, and cache the schematizer client.
     """
     self.log_name = log_name
     # Side effect: installs the package config globally before the
     # get_config() call below reads it.
     load_package_config('/nail/srv/configs/data_pipeline_tools.yaml')
     self.config = get_config()
     self.log = logging.getLogger(self.log_name)
     self._setup_logging()
     self.schematizer = get_schematizer()
Пример #5
0
def schematizer():
    """Return the schematizer singleton after resetting its private
    per-module state (client and caches).
    """
    schematizer = get_schematizer()
    # schematizer is a Singleton. Rerun the ctor of Schematizer per module.
    schematizer._client = get_config().schematizer_client  # swaggerpy client
    schematizer._cache = _Cache()
    schematizer._avro_schema_cache = {}
    return schematizer
Пример #6
0
 def meta_attribute_avro_schema(self, meta_attribute_avro_schema_json):
     """Register the given meta-attribute schema json under the test
     namespace and return the schematizer's registration result.
     """
     registration_args = dict(
         namespace="test_namespace",
         source="meta_me_meta",
         schema_json=meta_attribute_avro_schema_json,
         source_owner_email="*****@*****.**",
         contains_pii=False,
     )
     return get_schematizer().register_schema_from_schema_json(
         **registration_args
     )
Пример #7
0
 def test_missing_mandatory_meta_attributes(
     self,
     valid_message_data,
     meta_param,
     mandatory_meta_attr_ids
 ):
     """Building a dry-run message must fail when mandatory meta
     attributes (per the mocked schematizer) are absent from meta_param,
     and the error must name exactly the missing ids.
     """
     with mock.patch.object(
         get_schematizer(),
         'get_meta_attributes_by_schema_id',
         return_value=mandatory_meta_attr_ids
     ):
         with pytest.raises(MissingMetaAttributeException) as e:
             self._get_dry_run_message_with_meta(
                 valid_message_data,
                 meta_param
             )
         assert e.value.args
         # Fix: the original `{id for id in ...}` shadowed the builtin
         # `id`; it is simply set(...). Extracted for readability.
         missing_ids = (
             set(mandatory_meta_attr_ids) -
             {m.schema_id for m in meta_param}
         )
         assert (
             "Meta Attributes with IDs `{0}` are not found for "
             "schema_id `{1}`.".format(
                 ", ".join(str(m) for m in missing_ids),
                 valid_message_data['schema_id']
             )) in e.value.args[0]
Пример #8
0
 def monitor_schema(self):
     """Register the monitor schema with the schematizer and return the
     registration result.
     """
     schema = self._monitor_schema
     return get_schematizer().register_schema(
         namespace=schema['namespace'],
         source=schema['name'],
         schema_str=simplejson.dumps(schema),
         source_owner_email='*****@*****.**',
         contains_pii=False,
     )
 def __init__(self):
     """Set up the replication stream parser: open DB connections, wrap
     the schematizer client, and read dry-run/changelog flags from the
     environment config.

     Exits the process immediately when kafka_producer_buffer_size
     exceeds recovery_queue_size.
     """
     super(BaseParseReplicationStream, self).__init__()
     self.db_connections = get_connection(
         config.env_config.topology_path,
         config.env_config.rbr_source_cluster,
         config.env_config.schema_tracker_cluster,
         config.env_config.rbr_state_cluster,
         config.env_config.rbr_source_cluster_topology_name,
     )
     self.schema_wrapper = SchemaWrapper(
         db_connections=self.db_connections,
         schematizer_client=get_schematizer())
     self.register_dry_run = config.env_config.register_dry_run
     self.publish_dry_run = config.env_config.publish_dry_run
     self._running = True
     self._profiler_running = False
     self._changelog_mode = config.env_config.changelog_mode
     if get_config(
     ).kafka_producer_buffer_size > config.env_config.recovery_queue_size:
         # Printing here, since this executes *before* logging is
         # configured.
         # Bug fix: the original string used a backslash continuation,
         # which baked a run of indentation spaces into the message and
         # had no trailing newline.
         sys.stderr.write(
             "Shutting down because kafka_producer_buffer_size was "
             "greater than recovery_queue_size\n"
         )
         sys.exit(1)
Пример #10
0
    def process_commandline_options(self, args=None):
        """Parse CLI options, load the package config, and cache the
        dry-run/whitelist options plus the schematizer client.
        """
        super(CompactionSetter, self).process_commandline_options(args=args)

        load_package_config(self.options.config_path)
        self.dry_run = self.options.dry_run
        self.whitelist_topic = self.options.whitelist_topic
        self.schematizer = get_schematizer()
 def process_commandline_options(self, args=None):
     """Validate CLI options, load the package config, and resolve
     exactly one source id to refresh.

     Raises:
         ValueError: for non-positive --avg-rows-per-second-cap or
             --batch-size, an invalid combination of --source-id /
             --source-name / --namespace, or when the name pair matches
             zero or more than one source.
     """
     super(FullRefreshRequester,
           self).process_commandline_options(args=args)
     if (self.options.avg_rows_per_second_cap is not None
             and self.options.avg_rows_per_second_cap <= 0):
         raise ValueError(
             "--avg-rows-per-second-cap must be greater than 0")
     if self.options.batch_size <= 0:
         raise ValueError("--batch-size option must be greater than 0.")
     if not self.options.source_id and not (self.options.source_name
                                            and self.options.namespace):
         # Typo fix: original message read "both of--source-name".
         raise ValueError(
             "--source-id or both of --source-name and --namespace must be defined"
         )
     if self.options.source_id and (self.options.source_name
                                    or self.options.namespace):
         raise ValueError(
             "Cannot use both --source-id and either of --namespace and --source-name"
         )
     load_package_config(self.options.config_path)
     self.schematizer = get_schematizer()
     source_ids = self.get_source_ids()
     if len(source_ids) == 0:
         raise ValueError(
             "Found no sources with namespace_name {} and source_name {}".
             format(self.options.namespace, self.options.source_name))
     elif len(source_ids) > 1:
         raise ValueError(
             "Pair of namespace_name {} and source_name {} somehow received more than one"
             " source. Investigation as to how is recommended.".format(
                 self.options.namespace, self.options.source_name))
     self.source_id = source_ids[0]
Пример #12
0
 def registration_schema(self):
     """Register self._registration_schema with the schematizer and
     return the registration result.
     """
     payload = self._registration_schema
     return get_schematizer().register_schema(
         namespace=payload['namespace'],
         source=payload['name'],
         schema_str=simplejson.dumps(payload),
         source_owner_email='*****@*****.**',
         contains_pii=False,
     )
Пример #13
0
 def test_setup_contains_pii_from_schematizer_once(self, message):
     """contains_pii should query the schematizer only on first access;
     later accesses must not call get_schema_by_id again.
     """
     schematizer_client = get_schematizer()
     with attach_spy_on_func(schematizer_client, 'get_schema_by_id') as spy:
         # First access fetches the schema.
         message.contains_pii
         assert spy.call_count == 1
     with attach_spy_on_func(schematizer_client, 'get_schema_by_id') as spy:
         # Second access makes no further schematizer calls.
         message.contains_pii
         assert spy.call_count == 0
Пример #14
0
 def get_schema_json(cls):
     """Register cls.SOURCE_SCHEMA under a randomly-suffixed test source
     and return the schematizer's registration result.
     """
     random_source = "test_source_{}".format(randint(0, 100))
     return get_schematizer().register_schema(
         schema_str=cls.SOURCE_SCHEMA,
         namespace='test_namespace',
         source=random_source,
         source_owner_email='*****@*****.**',
         contains_pii=False
     )
Пример #15
0
 def test_setup_contains_pii_from_schematizer_once(self, message):
     """First contains_pii access calls get_schema_by_id exactly once;
     a second access must not call it at all.
     """
     schematizer_client = get_schematizer()
     with attach_spy_on_func(schematizer_client, 'get_schema_by_id') as spy:
         # First access fetches the schema.
         message.contains_pii
         assert spy.call_count == 1
     with attach_spy_on_func(schematizer_client, 'get_schema_by_id') as spy:
         # Second access makes no further schematizer calls.
         message.contains_pii
         assert spy.call_count == 0
Пример #16
0
 def meta_attribute_avro_schema(self, meta_attribute_avro_schema_json):
     """Register the supplied meta-attribute schema json under the test
     namespace and return the schematizer's result.
     """
     schematizer = get_schematizer()
     return schematizer.register_schema_from_schema_json(
         schema_json=meta_attribute_avro_schema_json,
         namespace="test_namespace",
         source="meta_me_meta",
         source_owner_email="*****@*****.**",
         contains_pii=False
     )
Пример #17
0
 def mock_get_topics_by_criteria(self, topics):
     """Yield get_topics_by_criteria patched on the schematizer singleton
     so that it returns *topics*.
     """
     patcher = mock.patch.object(
         get_schematizer(),
         'get_topics_by_criteria',
         autospec=True,
         return_value=topics,
     )
     with patcher as patched_method:
         yield patched_method
Пример #18
0
 def test_set_meta_with_valid_meta_attributes(self, valid_message_data,
                                              meta_param,
                                              mandatory_meta_attr_ids):
     """A dry-run message built with all mandatory meta attributes keeps
     exactly the supplied meta.
     """
     patched = mock.patch.object(
         get_schematizer(),
         'get_meta_attributes_by_schema_id',
         return_value=mandatory_meta_attr_ids,
     )
     with patched:
         message = self._get_dry_run_message_with_meta(
             valid_message_data, meta_param)
         assert message._meta == meta_param
Пример #19
0
 def registration_schema(self):
     """Register self._registration_schema with the schematizer and
     return the registration result.
     """
     schema_json = self._registration_schema
     registration = dict(
         namespace=schema_json['namespace'],
         source=schema_json['name'],
         schema_str=simplejson.dumps(schema_json),
         source_owner_email='*****@*****.**',
         contains_pii=False,
     )
     return get_schematizer().register_schema(**registration)
def check_schematizer_has_correct_source_info(context):
    """Assert the schematizer registered the expected source/topic/schema
    for the table recorded in *context*, and record the kafka topic.
    """
    client = get_schematizer()
    namespace_name = context.data['namespace']
    table_name = context.data['table_name']
    all_sources = client.get_sources_by_namespace(namespace_name)
    # Most recently registered source with the expected name wins.
    source = next(
        candidate
        for candidate in reversed(all_sources)
        if candidate.name == table_name
    )
    topic = unlist(client.get_topics_by_source_id(source.source_id))
    latest_schema = client.get_latest_schema_by_topic_name(topic.name)
    context.data['kafka_topic'] = topic.name
    setup_kafka_topic(topic.name)
    assert latest_schema.topic.source.name == table_name
    assert latest_schema.topic.source.namespace.name == namespace_name
    assert latest_schema.schema_json == context.data['expected_avro_schema']
Пример #21
0
def check_schematizer_has_correct_source_info(context):
    """Verify source/topic/schema registration for the table in *context*
    and stash the kafka topic name in the context.
    """
    schematizer = get_schematizer()
    expected_table = context.data['table_name']
    sources = schematizer.get_sources_by_namespace(context.data['namespace'])
    # Scan newest-first for the matching source name.
    source = next(
        src for src in reversed(sources) if src.name == expected_table
    )
    topic = unlist(schematizer.get_topics_by_source_id(source.source_id))
    schema = schematizer.get_latest_schema_by_topic_name(topic.name)
    context.data['kafka_topic'] = topic.name
    setup_kafka_topic(topic.name)
    assert schema.topic.source.name == expected_table
    assert schema.topic.source.namespace.name == context.data['namespace']
    assert schema.schema_json == context.data['expected_avro_schema']
Пример #22
0
 def test_set_meta_with_valid_meta_attributes(
     self,
     valid_message_data,
     meta_param,
     mandatory_meta_attr_ids
 ):
     """When every mandatory meta attribute is supplied, the dry-run
     message keeps exactly the given meta.
     """
     schematizer = get_schematizer()
     with mock.patch.object(
         schematizer,
         'get_meta_attributes_by_schema_id',
         return_value=mandatory_meta_attr_ids
     ):
         result = self._get_dry_run_message_with_meta(
             valid_message_data, meta_param)
         assert result._meta == meta_param
Пример #23
0
 def _register_schema(self, namespace, source, containers):
     """Register a minimal one-int-field record schema and create its
     kafka topic; return the registered schema object.
     """
     id_field = {'type': 'int', 'doc': 'test', 'name': 'id'}
     record_schema = {
         'type': 'record',
         'name': source,
         'namespace': namespace,
         'doc': 'test',
         'fields': [id_field]
     }
     registered = get_schematizer().register_schema_from_schema_json(
         namespace=namespace,
         source=source,
         schema_json=record_schema,
         source_owner_email='*****@*****.**',
         contains_pii=False
     )
     containers.create_kafka_topic(str(registered.topic.name))
     return registered
Пример #24
0
 def _setup_schematizer_topics(self):
     """Merge topics matching --namespace/--source into
     topic_to_offsets_map, without clobbering entries already present.
     """
     # Nothing to do unless at least one filter option was given.
     if not (self.options.namespace or self.options.source):
         return
     schematizer = get_schematizer()
     additional_topics = schematizer.get_topics_by_criteria(
         namespace_name=self.options.namespace,
         source_name=self.options.source
     )
     if self.options.only_newest:
         additional_topics = self._filter_by_most_recently_updated(
             additional_topics
         )
     logger.info(
         "Received {} new topics from --source and --namespace options".format(
             len(additional_topics)
         )
     )
     for topic in additional_topics:
         topic_name = str(topic.name)
         if topic_name not in self.topic_to_offsets_map:
             self.topic_to_offsets_map[topic_name] = None
Пример #25
0
def get_transaction_id_schema_id(gtid_enabled):
    """Register the transaction-id avro schema and return its schema id.

    Picks the GTID or log-position flavour of the schema file depending
    on *gtid_enabled*, registers it with the schematizer under the
    ``yelp.replication_handler`` namespace, and returns the new id.
    """
    if gtid_enabled:
        file_name = GLOBAL_TRANSACTION_ID_SCHEMA_FILEPATH
        source = 'global_transaction_id'
    else:
        file_name = LOG_TRANSACTION_ID_SCHEMA_FILEPATH
        source = 'log_transaction_id'

    # simplejson.load parses straight from the file object; no need to
    # slurp the whole file into a string first.
    with open(file_name, 'r') as schema_file:
        avro_schema = simplejson.load(schema_file)
    schema = get_schematizer().register_schema_from_schema_json(
        namespace='yelp.replication_handler',
        source=source,
        schema_json=avro_schema,
        source_owner_email='*****@*****.**',
        contains_pii=False,
    )
    return schema.schema_id
Пример #26
0
 def _register_avro_schema(self, namespace, source, two_fields):
     """Register an int-record schema with one field ('foo') or two
     ('foo' and 'bar') depending on *two_fields*.
     """
     field_names = ['foo', 'bar'] if two_fields else ['foo']
     schema_json = {
         'type': 'record',
         'name': source,
         'namespace': namespace,
         'doc': 'test',
         'fields': [
             {'type': 'int', 'doc': 'test', 'name': name}
             for name in field_names
         ]
     }
     return get_schematizer().register_schema(
         namespace=namespace,
         source=source,
         schema_str=simplejson.dumps(schema_json),
         source_owner_email=self.source_owner_email,
         contains_pii=False,
         base_schema_id=None)
Пример #27
0
 def test_missing_mandatory_meta_attributes(self, valid_message_data,
                                            meta_param,
                                            mandatory_meta_attr_ids):
     """Dry-run message creation must raise when mandatory meta
     attributes are missing, and the error must list exactly the
     missing ids.
     """
     with mock.patch.object(get_schematizer(),
                            'get_meta_attributes_by_schema_id',
                            return_value=mandatory_meta_attr_ids):
         with pytest.raises(MissingMetaAttributeException) as e:
             self._get_dry_run_message_with_meta(valid_message_data,
                                                 meta_param)
         assert e.value.args
         # Fix: `{id for id in ...}` shadowed the builtin `id` and is
         # just set(...). Extracted for readability.
         missing_ids = (set(mandatory_meta_attr_ids) -
                        {m.schema_id for m in meta_param})
         assert ("Meta Attributes with IDs `{0}` are not found for "
                 "schema_id `{1}`.".format(
                     ", ".join(str(m) for m in missing_ids),
                     valid_message_data['schema_id'])) in e.value.args[0]
Пример #28
0
 def _register_avro_schema(self, namespace, source, two_fields):
     """Register a test record schema; includes a second 'bar' int field
     when *two_fields* is true.
     """
     field_names = ('foo', 'bar') if two_fields else ('foo',)
     record = {
         'type': 'record',
         'name': source,
         'namespace': namespace,
         'doc': 'test',
         'fields': [
             {'type': 'int', 'doc': 'test', 'name': n}
             for n in field_names
         ]
     }
     return get_schematizer().register_schema(
         namespace=namespace,
         source=source,
         schema_str=simplejson.dumps(record),
         source_owner_email=self.source_owner_email,
         contains_pii=False,
         base_schema_id=None
     )
Пример #29
0
    def process_commandline_options(self, args=None):
        """Validate CLI options, then load the package config and cache
        the schematizer client.

        Raises:
            ValueError: for non-positive --avg-rows-per-second-cap or
                --batch-size, or for an invalid combination of
                --source-id / --source-name / --namespace.
        """
        super(FullRefreshJob, self).process_commandline_options(args=args)
        if (self.options.avg_rows_per_second_cap is not None and
                self.options.avg_rows_per_second_cap <= 0):
            raise ValueError("--avg-rows-per-second-cap must be greater than 0")
        if self.options.batch_size <= 0:
            raise ValueError("--batch-size option must be greater than 0.")
        if not self.options.source_id and not (
            self.options.source_name and
            self.options.namespace
        ):
            # Typo fix: original message read "both of--source-name".
            raise ValueError("--source-id or both of --source-name and --namespace must be defined")
        if self.options.source_id and (
            self.options.source_name or
            self.options.namespace
        ):
            raise ValueError("Cannot use both --source-id and either of --namespace and --source-name")

        load_package_config(self.options.config_path)
        self.schematizer = get_schematizer()
Пример #30
0
 def _register_schema(self, namespace, source, containers):
     """Register a one-int-field record schema, create its kafka topic,
     and return the registered schema.
     """
     avro_record = {
         'type': 'record',
         'name': source,
         'namespace': namespace,
         'doc': 'test',
         'fields': [{'type': 'int', 'doc': 'test', 'name': 'id'}],
     }
     schematizer = get_schematizer()
     reg_schema = schematizer.register_schema_from_schema_json(
         namespace=namespace,
         source=source,
         schema_json=avro_record,
         source_owner_email='*****@*****.**',
         contains_pii=False,
     )
     topic_name = str(reg_schema.topic.name)
     containers.create_kafka_topic(topic_name)
     return reg_schema
Пример #31
0
    def process_commandline_options(self, args=None):
        """Validate CLI options, then load the package config and cache
        the schematizer client.

        Raises:
            ValueError: for non-positive --avg-rows-per-second-cap or
                --batch-size, or for an invalid combination of
                --source-id / --source-name / --namespace.
        """
        super(FullRefreshJob, self).process_commandline_options(args=args)
        if (self.options.avg_rows_per_second_cap is not None
                and self.options.avg_rows_per_second_cap <= 0):
            raise ValueError(
                "--avg-rows-per-second-cap must be greater than 0")
        if self.options.batch_size <= 0:
            raise ValueError("--batch-size option must be greater than 0.")
        if not self.options.source_id and not (self.options.source_name
                                               and self.options.namespace):
            # Typo fix: original message read "both of--source-name".
            raise ValueError(
                "--source-id or both of --source-name and --namespace must be defined"
            )
        if self.options.source_id and (self.options.source_name
                                       or self.options.namespace):
            raise ValueError(
                "Cannot use both --source-id and either of --namespace and --source-name"
            )

        load_package_config(self.options.config_path)
        self.schematizer = get_schematizer()
 def schematizer(self, containers):
     """Return the schematizer client.

     NOTE(review): *containers* is accepted but unused — presumably a
     fixture-ordering dependency; confirm.
     """
     client = get_schematizer()
     return client
Пример #33
0
 def __init__(self):
     # Shared schematizer client plus a private cache for schema ids
     # (populated elsewhere; starts empty).
     self._schematizer = get_schematizer()
     self._schema_id_cache = {}
Пример #34
0
 def _schematizer(self):
     # Thin accessor for the shared schematizer client.
     return get_schematizer()
Пример #35
0
 def schematizer(self, containers):
     # Returns the schematizer client; *containers* is unused here —
     # presumably a fixture-ordering dependency (confirm).
     return get_schematizer()
Пример #36
0
def schematizer_client(containers):
    # Returns the schematizer client; *containers* is accepted but unused
    # — presumably a fixture-ordering dependency (confirm).
    return get_schematizer()
Пример #37
0
 def mock_get_topics_by_criteria(self, topics):
     """Yield get_topics_by_criteria patched on the schematizer singleton
     to return *topics*.
     """
     target = get_schematizer()
     with mock.patch.object(
         target,
         'get_topics_by_criteria',
         return_value=topics,
         autospec=True,
     ) as patched:
         yield patched
Пример #38
0
def schematizer_client(containers):
    # Returns the schematizer client; *containers* is unused here —
    # presumably a fixture-ordering dependency (confirm).
    return get_schematizer()