Example #1
def offset_range_for_timestamp_range(brokers,
                                     start,
                                     end,
                                     topic=mjolnir.kafka.TOPIC_RESULT):
    """Determine OffsetRange for a given timestamp range

    Parameters
    ----------
    brokers : list of str
        List of Kafka broker host:port strings to bootstrap the Kafka connection with
    start : number
        Unix timestamp in seconds
    end : number
        Unix timestamp in seconds
    topic : str
        Kafka topic to retrieve offsets for

    Returns
    -------
    list of pyspark.streaming.kafka.OffsetRange or None
        Per-partition ranges of offsets to read
    """
    consumer = kafka.KafkaConsumer(bootstrap_servers=brokers,
                                   api_version=mjolnir.kafka.BROKER_VERSION)
    partitions = consumer.partitions_for_topic(topic)
    if partitions is None:
        # Topic does not exist.
        return None
    partitions = [kafka.TopicPartition(topic, p) for p in partitions]
    o_start = offsets_for_times(consumer, partitions, start)
    o_end = offsets_for_times(consumer, partitions, end)
    return [
        OffsetRange(tp.topic, tp.partition, o_start[tp], o_end[tp])
        for tp in partitions
    ]
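A minimal usage sketch for the function above, assuming a live SparkContext named sc, a reachable broker list, and the default mjolnir.kafka.TOPIC_RESULT topic; the broker addresses and the one-hour window are placeholders, not values from the original project.

import time

from pyspark.streaming.kafka import KafkaUtils

brokers = ['kafka1001:9092', 'kafka1002:9092']  # placeholder broker list
end = time.time()
start = end - 3600  # read roughly the last hour of results

offset_ranges = offset_range_for_timestamp_range(brokers, start, end)
if offset_ranges is not None:
    # Feed the per-partition ranges straight into the direct Kafka RDD API.
    rdd = KafkaUtils.createRDD(
        sc,  # assumed existing SparkContext
        {'metadata.broker.list': ','.join(brokers)},
        offset_ranges)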
Example #2
File: tests.py Project: taichi44/spark
    def test_kafka_direct_stream_foreach_get_offsetRanges(self):
        """Test the Python direct Kafka stream foreachRDD get offsetRanges."""
        topic = self._randomTopic()
        sendData = {"a": 1, "b": 2, "c": 3}
        kafkaParams = {
            "metadata.broker.list": self._kafkaTestUtils.brokerAddress(),
            "auto.offset.reset": "smallest"
        }

        self._kafkaTestUtils.createTopic(topic)
        self._kafkaTestUtils.sendMessages(topic, sendData)

        stream = KafkaUtils.createDirectStream(self.ssc, [topic], kafkaParams)

        offsetRanges = []

        def getOffsetRanges(_, rdd):
            for o in rdd.offsetRanges():
                offsetRanges.append(o)

        stream.foreachRDD(getOffsetRanges)
        self.ssc.start()
        self.wait_for(offsetRanges, 1)

        self.assertEqual(offsetRanges,
                         [OffsetRange(topic, 0, long(0), long(6))])
Example #3
    def _get_new_offset_range_list(brokers, topic):
        """get offset range from earliest to latest."""
        offset_range_list = []

        # https://cwiki.apache.org/confluence/display/KAFKA/
        # A+Guide+To+The+Kafka+Protocol#
        # AGuideToTheKafkaProtocol-OffsetRequest
        GET_LATEST_OFFSETS = -1
        latest_dict = PreHourlyProcessor._get_offsets_from_kafka(
            brokers, topic, GET_LATEST_OFFSETS)

        GET_EARLIEST_OFFSETS = -2
        earliest_dict = PreHourlyProcessor._get_offsets_from_kafka(
            brokers, topic, GET_EARLIEST_OFFSETS)

        for item in latest_dict:
            until_offset = latest_dict[item].offsets[0]
            from_offset = earliest_dict[item].offsets[0]
            partition = latest_dict[item].partition
            topic = latest_dict[item].topic
            offset_range_list.append(OffsetRange(topic,
                                                 partition,
                                                 from_offset,
                                                 until_offset))

        return offset_range_list
Example #4
    def emit(self):
        sc = peachbox.Spark.Instance().context()

        peachbox.Spark.Instanz = peachbox.Spark.Instance()
        sc = peachbox.Spark.Instanz.context()

        kafka_client = kafka.KafkaClient('localhost:9092')

        reqs = [OffsetRequest(self.topic, 0, -1, 10)]
        until_offset = kafka_client.send_offset_request(reqs)[0].offsets[0]

        offset_ranges = [
            OffsetRange(topic=self.topic,
                        partition=0,
                        fromOffset=self.from_offset,
                        untilOffset=until_offset)
        ]
        print 'offset range: ' + str(
            self.from_offset) + ':' + str(until_offset)

        # TODO: This is kinda hacky, resolve it
        if self.from_offset > until_offset:
            self.from_offset = until_offset

        result = pyspark.streaming.kafka.KafkaUtils.createRDD(
            sc, self.kafka_params, offset_ranges)

        self.latest_offset = until_offset

        result = result.map(lambda x: self.read_json(x[1]))
        return {'data': result}
Example #5
File: tests.py Project: taichi44/spark
    def test_kafka_direct_stream_transform_get_offsetRanges(self):
        """Test the Python direct Kafka stream transform get offsetRanges."""
        topic = self._randomTopic()
        sendData = {"a": 1, "b": 2, "c": 3}
        kafkaParams = {
            "metadata.broker.list": self._kafkaTestUtils.brokerAddress(),
            "auto.offset.reset": "smallest"
        }

        self._kafkaTestUtils.createTopic(topic)
        self._kafkaTestUtils.sendMessages(topic, sendData)

        stream = KafkaUtils.createDirectStream(self.ssc, [topic], kafkaParams)

        offsetRanges = []

        def transformWithOffsetRanges(rdd):
            for o in rdd.offsetRanges():
                offsetRanges.append(o)
            return rdd

        # Test whether it is OK to mix KafkaTransformedDStream and TransformedDStream together;
        # only the TransformedDStreams can be folded together.
        stream.transform(transformWithOffsetRanges).map(
            lambda kv: kv[1]).count().pprint()
        self.ssc.start()
        self.wait_for(offsetRanges, 1)

        self.assertEqual(offsetRanges,
                         [OffsetRange(topic, 0, long(0), long(6))])
Example #6
    def _compose_chunk_offset_ranges(self, chunk):
        split_chunks = split_chunks_by_parallelism(chunk, self._parallelism)
        offset_ranges = [
            OffsetRange(self._kafka_topic,
                        partition=p,
                        fromOffset=s,
                        untilOffset=e) for (p, s, e) in split_chunks if s < e
        ]
        return offset_ranges
Example #7
def kafka_rdd(spark_context, kafka_brokers='192.168.1.106:9092'):
    return KafkaUtils.createRDD(
        sc=spark_context,
        kafkaParams={'metadata.broker.list': kafka_brokers},
        offsetRanges=[
            OffsetRange(topic='flights',
                        partition=0,
                        fromOffset=0,
                        untilOffset=49)
        ])
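A brief usage sketch, assuming a SparkContext named spark_context and a 'flights' topic that actually contains offsets 0 through 48 in partition 0 (the read fails if the requested range is not available on the broker):

# Pull the fixed range defined above and keep only the message payloads.
pairs = kafka_rdd(spark_context)
values = pairs.map(lambda kv: kv[1])
print(values.take(5))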
Example #8
File: tests.py Project: zyjibmcn/spark
    def test_kafka_rdd_get_offsetRanges(self):
        """Test Python direct Kafka RDD get OffsetRanges."""
        topic = self._randomTopic()
        sendData = {"a": 3, "b": 4, "c": 5}
        offsetRanges = [OffsetRange(topic, 0, long(0), long(sum(sendData.values())))]
        kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress()}

        self._kafkaTestUtils.createTopic(topic)
        self._kafkaTestUtils.sendMessages(topic, sendData)
        rdd = KafkaUtils.createRDD(self.sc, kafkaParams, offsetRanges)
        self.assertEqual(offsetRanges, rdd.offsetRanges())
Example #9
File: tests.py Project: zyjibmcn/spark
    def test_kafka_rdd(self):
        """Test the Python direct Kafka RDD API."""
        topic = self._randomTopic()
        sendData = {"a": 1, "b": 2}
        offsetRanges = [OffsetRange(topic, 0, long(0), long(sum(sendData.values())))]
        kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress()}

        self._kafkaTestUtils.createTopic(topic)
        self._kafkaTestUtils.sendMessages(topic, sendData)
        rdd = KafkaUtils.createRDD(self.sc, kafkaParams, offsetRanges)
        self._validateRddResult(sendData, rdd)
Example #10
File: tests.py Project: zyjibmcn/spark
    def test_kafka_rdd_with_leaders(self):
        """Test the Python direct Kafka RDD API with leaders."""
        topic = self._randomTopic()
        sendData = {"a": 1, "b": 2, "c": 3}
        offsetRanges = [OffsetRange(topic, 0, long(0), long(sum(sendData.values())))]
        kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress()}
        address = self._kafkaTestUtils.brokerAddress().split(":")
        leaders = {TopicAndPartition(topic, 0): Broker(address[0], int(address[1]))}

        self._kafkaTestUtils.createTopic(topic)
        self._kafkaTestUtils.sendMessages(topic, sendData)
        rdd = KafkaUtils.createRDD(self.sc, kafkaParams, offsetRanges, leaders)
        self._validateRddResult(sendData, rdd)
Example #11
    def test_fetch_quantity_sum(self, usage_manager, setter_manager,
                                insert_manager, data_driven_specs_repo):

        # test operation
        test_operation = "sum"

        # load components
        usage_manager.return_value = MockComponentManager.get_usage_cmpt_mgr()
        setter_manager.return_value = \
            MockComponentManager.get_setter_cmpt_mgr()
        insert_manager.return_value = \
            MockComponentManager.get_insert_cmpt_mgr()

        # init mock driver tables
        data_driven_specs_repo.return_value = \
            MockDataDrivenSpecsRepo(self.spark_context,
                                    self.get_pre_transform_specs_json(),
                                    self.get_transform_specs_json_by_operation(
                                        test_operation))

        # Create an emulated set of Kafka messages (these were gathered
        # by extracting Monasca messages from the Metrics queue on mini-mon).

        # Create an RDD out of the mocked Monasca metrics
        with open(DataProvider.kafka_data_path) as f:
            raw_lines = f.read().splitlines()
        raw_tuple_list = [eval(raw_line) for raw_line in raw_lines]

        rdd_monasca = self.spark_context.parallelize(raw_tuple_list)

        # decorate mocked RDD with dummy kafka offsets
        myOffsetRanges = [OffsetRange("metrics", 1, 10,
                                      20)]  # mimic rdd.offsetRanges()

        transform_context = TransformContextUtils.get_context(
            offset_info=myOffsetRanges,
            batch_time_info=self.get_dummy_batch_time())

        rdd_monasca_with_offsets = rdd_monasca.map(
            lambda x: RddTransformContext(x, transform_context))

        try:
            # Call the primary method in mon_metrics_kafka
            MonMetricsKafkaProcessor.rdd_to_recordstore(
                rdd_monasca_with_offsets)
            self.assertTrue(False)
        except FetchQuantityUtilException as e:
            self.assertTrue("Operation sum is not supported" in e.value)
Example #12
    def test_missing_field_to_filter(self, usage_manager, setter_manager,
                                     insert_manager, data_driven_specs_repo):
        # load components
        usage_manager.return_value = MockComponentManager.get_usage_cmpt_mgr()
        setter_manager.return_value = \
            MockComponentManager.get_setter_cmpt_mgr()
        insert_manager.return_value = \
            MockComponentManager.get_insert_cmpt_mgr()

        # init mock driver tables
        data_driven_specs_repo.return_value = \
            MockDataDrivenSpecsRepo(
                self.spark_context,
                self.get_pre_transform_specs_json(),
                self.get_invalid_filter_transform_specs_json("",
                                                             "-mgmt$",
                                                             "exclude"))

        # Create an emulated set of Kafka messages (these were gathered
        # by extracting Monasca messages from the Metrics queue on mini-mon).

        # Create an RDD out of the mocked Monasca metrics
        with open(DataProvider.fetch_quantity_data_path) as f:
            raw_lines = f.read().splitlines()
        raw_tuple_list = [eval(raw_line) for raw_line in raw_lines]

        rdd_monasca = self.spark_context.parallelize(raw_tuple_list)

        # decorate mocked RDD with dummy kafka offsets
        myOffsetRanges = [OffsetRange("metrics", 1, 10,
                                      20)]  # mimic rdd.offsetRanges()

        transform_context = TransformContextUtils.get_context(
            offset_info=myOffsetRanges,
            batch_time_info=self.get_dummy_batch_time())

        rdd_monasca_with_offsets = rdd_monasca.map(
            lambda x: RddTransformContext(x, transform_context))

        try:
            # Call the primary method in mon_metrics_kafka
            MonMetricsKafkaProcessor.rdd_to_recordstore(
                rdd_monasca_with_offsets)
            # In this case, it's an error if no exception is caught
            self.assertTrue(False)
        except FetchQuantityException as e:
            self.assertTrue("Encountered invalid filter details:" in e.value)
            self.assertTrue("field to filter = ," in e.value)
Example #13
def collect_results(sc, brokers, receive_record, offsets_start, offsets_end, run_id):
    """
    Parameters
    ----------
    sc : pyspark.SparkContext
    brokers : list of str
    receive_record : callable
        Callable receiving a JSON-decoded record from Kafka. It must return
        either an empty list on error, or a 3-item tuple containing
        hit_page_id as int, query as str, and features as DenseVector
    offsets_start : list of int
        Per-partition offsets to start reading at
    offsets_end : list of int
        Per-partition offsets to end reading at
    run_id : str
        unique identifier for this run

    Returns
    -------
    pyspark.RDD
        RDD containing results of receive_record
    """

    offset_ranges = []
    if offsets_start is None:
        offsets_start = get_offset_start(brokers, mjolnir.kafka.TOPIC_RESULT)

    if offsets_start is None:
        raise RuntimeError("Cannot fetch offset_start, topic %s should have been created" % mjolnir.kafka.TOPIC_RESULT)
    for partition, (start, end) in enumerate(zip(offsets_start, offsets_end)):
        offset_ranges.append(OffsetRange(mjolnir.kafka.TOPIC_RESULT, partition, start, end))
    assert not isinstance(brokers, basestring)
    # TODO: how can we force the kafka api_version here?
    kafka_params = {
        'metadata.broker.list': ','.join(brokers),
        # Set high fetch size values so we don't fail because of large messages
        'max.partition.fetch.bytes': '40000000',
        'fetch.message.max.bytes': '40000000'
    }

    # If this ends up being too much data from kafka, blowing up memory in the
    # spark executors, we could chunk the offsets and union together multiple RDD's.
    return (
        KafkaUtils.createRDD(sc, kafka_params, offset_ranges)
        .map(lambda (k, v): json.loads(v))
        .filter(lambda rec: 'run_id' in rec and rec['run_id'] == run_id)
        .flatMap(receive_record))
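A hedged usage sketch for collect_results, assuming sc is a SparkContext, mjolnir.kafka.TOPIC_RESULT exists, and each record carries 'hit_page_id', 'query' and 'features' fields; the broker list, end offsets and run_id below are placeholders.

from pyspark.mllib.linalg import Vectors

def receive_record(rec):
    try:
        return [(int(rec['hit_page_id']), str(rec['query']),
                 Vectors.dense(rec['features']))]
    except (KeyError, TypeError, ValueError):
        # Per the contract above, a bad record is signalled with an empty list.
        return []

brokers = ['kafka1001:9092']   # placeholder
offsets_end = [1000]           # placeholder per-partition end offsets
results = collect_results(sc, brokers, receive_record,
                          offsets_start=None, offsets_end=offsets_end,
                          run_id='example-run-id')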
Example #14
    def test_invalid_aggregated_metric_name(self, usage_manager,
                                            setter_manager, insert_manager,
                                            data_driven_specs_repo):

        # load components
        usage_manager.return_value = MockComponentManager.get_usage_cmpt_mgr()
        setter_manager.return_value = \
            MockComponentManager.get_setter_cmpt_mgr()
        insert_manager.return_value = \
            MockComponentManager.get_insert_cmpt_mgr()

        # init mock driver tables
        data_driven_specs_repo.return_value = \
            MockDataDrivenSpecsRepo(
                self.spark_context,
                self.get_pre_transform_specs_json(),
                self.get_transform_specs_json_invalid_name())

        # Create an emulated set of Kafka messages (these were gathered
        # by extracting Monasca messages from the Metrics queue on mini-mon).

        # Create an RDD out of the mocked Monasca metrics
        with open(DataProvider.fetch_quantity_data_path) as f:
            raw_lines = f.read().splitlines()
        raw_tuple_list = [eval(raw_line) for raw_line in raw_lines]

        rdd_monasca = self.spark_context.parallelize(raw_tuple_list)

        # decorate mocked RDD with dummy kafka offsets
        myOffsetRanges = [OffsetRange("metrics", 1, 10,
                                      20)]  # mimic rdd.offsetRanges()

        transform_context = TransformContextUtils.get_context(
            offset_info=myOffsetRanges,
            batch_time_info=self.get_dummy_batch_time())

        rdd_monasca_with_offsets = rdd_monasca.map(
            lambda x: RddTransformContext(x, transform_context))

        # Call the primary method in mon_metrics_kafka
        MonMetricsKafkaProcessor.rdd_to_recordstore(rdd_monasca_with_offsets)

        # get the metrics that have been submitted to the dummy message adapter
        metrics = DummyAdapter.adapter_impl.metric_list

        # metrics should be empty
        self.assertFalse(metrics)
Example #15
    def _get_offset_range_list(brokers,
                               topic,
                               app_name,
                               saved_offset_spec):
        """get offset range from saved offset to latest."""
        offset_range_list = []

        # https://cwiki.apache.org/confluence/display/KAFKA/
        # A+Guide+To+The+Kafka+Protocol#
        # AGuideToTheKafkaProtocol-OffsetRequest
        GET_LATEST_OFFSETS = -1
        latest_dict = PreHourlyProcessor._get_offsets_from_kafka(
            brokers, topic, GET_LATEST_OFFSETS)

        GET_EARLIEST_OFFSETS = -2
        earliest_dict = PreHourlyProcessor._get_offsets_from_kafka(
            brokers, topic, GET_EARLIEST_OFFSETS)

        saved_dict = PreHourlyProcessor._parse_saved_offsets(
            app_name, topic, saved_offset_spec)

        for item in latest_dict:
            # saved spec
            (spec_app_name,
             spec_topic_name,
             spec_partition,
             spec_from_offset,
             spec_until_offset) = saved_dict[item]

            # until
            until_offset = latest_dict[item].offsets[0]

            # from
            if spec_until_offset is not None and int(spec_until_offset) >= 0:
                from_offset = spec_until_offset
            else:
                from_offset = earliest_dict[item].offsets[0]

            partition = latest_dict[item].partition
            topic = latest_dict[item].topic
            offset_range_list.append(OffsetRange(topic,
                                                 partition,
                                                 from_offset,
                                                 until_offset))

        return offset_range_list
Example #16
        media += float(i)

    return media / len(lista)


if __name__ == "__main__":

    sc = SparkContext('local[*]', 'hands on PySpark')

    kafkaParams = {"metadata.broker.list": "localhost:9092"}

    start = 1  # skip the first line
    until = 500000
    partition = 0
    topic = 'csvtopic'
    offset1 = OffsetRange(topic, partition, start, until)
    # offset2 = OffsetRange('csvtopic', 0, 500001, 1000000)
    offsets = [offset1]

    print(" >>>>>>>> CONSUMINDO KAFKA <<<<<<<<")

    rdd = KafkaUtils.createRDD(sc, kafkaParams, offsets)

    linhas = rdd.map(lambda x: x[1])

    # linhas.foreach(printer)



    arr = linhas.map(criarPoints)\
        .map(setDistance)\
Example #17
    def kafka(self,
              host,
              topic,
              offset_ranges=None,
              key_deserializer=None,
              value_deserializer=None,
              schema=None,
              port=9092,
              parallelism=None,
              options=None):
        """Creates dataframe from specified set of messages from Kafka topic.

        Defining ranges:
            - If `offset_ranges` is specified it defines which specific range to read.
            - If `offset_ranges` is omitted, the topic's partitions are auto-discovered.

        The `schema` parameter, if specified, should contain two top level fields:
        `key` and `value`.

        Parameters `key_deserializer` and `value_deserializer` are callables
        which get bytes as input and should return python structures as output.

        Args:
            host (str): Kafka host.
            topic (str|None): Kafka topic to read from.
            offset_ranges (list[(int, int, int)]|None): List of partition ranges
                [(partition, start_offset, end_offset)].
            key_deserializer (function): Function used to deserialize the key.
            value_deserializer (function): Function used to deserialize the value.
            schema (pyspark.sql.types.StructType): Schema to apply to create a Dataframe.
            port (int): Kafka port.
            parallelism (int|None): The max number of parallel tasks that could be executed
                during the read stage (see :ref:`controlling-the-load`).
            options (dict|None): Additional kafka parameters, see KafkaUtils.createRDD docs.

        Returns:
            pyspark.sql.DataFrame

        Raises:
            InvalidArgumentError
        """
        assert self._spark.has_package('org.apache.spark:spark-streaming-kafka')

        if not key_deserializer or not value_deserializer or not schema:
            raise InvalidArgumentError('You should specify all of the parameters: '
                                       '`key_deserializer`, `value_deserializer` and `schema`')

        kafka_params = {
            'metadata.broker.list': '{}:{}'.format(host, port),
        }

        if options:
            kafka_params.update(options)

        if not offset_ranges:
            offset_ranges = kafka_get_topics_offsets(host, topic, port)

        offset_ranges = [OffsetRange(topic, partition, start_offset, end_offset)
                         for partition, start_offset, end_offset in offset_ranges]

        rdd = KafkaUtils.createRDD(self._spark.sparkContext,
                                   kafkaParams=kafka_params,
                                   offsetRanges=offset_ranges or [],
                                   keyDecoder=key_deserializer,
                                   valueDecoder=value_deserializer,
                                   )

        if parallelism:
            rdd = rdd.coalesce(parallelism)

        return self._spark.createDataFrame(rdd, schema=schema)
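A usage sketch for the kafka() reader above, assuming reader is an instance of the class that defines it, the spark-streaming-kafka package is available, and both key and value of each message are JSON-encoded; the host, topic and schema field names are placeholders.

import json

from pyspark.sql.types import StringType, StructField, StructType

schema = StructType([
    StructField('key', StructType([StructField('id', StringType())])),
    StructField('value', StructType([StructField('name', StringType())])),
])

df = reader.kafka(
    host='kafka.example.org',        # placeholder host
    topic='events',                  # placeholder topic
    key_deserializer=lambda raw: json.loads(raw),
    value_deserializer=lambda raw: json.loads(raw),
    schema=schema)
df.show()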
Example #18
    def get_effective_offset_range_list(offset_range_list):
        """Get effective batch offset range.

        The effective batch offset range covers offsets starting from the
        effective batch revision (defined by the effective_batch_revision
        config property). By default this method sets the pyspark
        OffsetRange.fromOffset for each partition to a value older than the
        latest revision (defaults to latest - 1) so that the pre-hourly
        processor has access to the entire data for the hour. This also
        accounts for and covers any early arriving data (data that arrives
        before the start of the hour).
        """

        offset_specifications = PreHourlyProcessor.get_offset_specs()

        app_name = PreHourlyProcessor.get_app_name()

        topic = PreHourlyProcessor.get_kafka_topic()

        # start offset revision
        effective_batch_revision = cfg.CONF.pre_hourly_processor.\
            effective_batch_revision

        effective_batch_spec = offset_specifications\
            .get_kafka_offsets_by_revision(app_name,
                                           effective_batch_revision)

        # get latest revision, if penultimate is unavailable
        if not effective_batch_spec:
            log.debug("effective batch spec: offsets: revision %s unavailable,"
                      " getting the latest revision instead..." % (
                          effective_batch_revision))
            # not available
            effective_batch_spec = offset_specifications.get_kafka_offsets(
                app_name)

        effective_batch_offsets = PreHourlyProcessor._parse_saved_offsets(
            app_name, topic,
            effective_batch_spec)

        # for debugging
        for effective_key in effective_batch_offsets.keys():
            effective_offset = effective_batch_offsets.get(effective_key,
                                                           None)
            (effect_app_name,
             effect_topic_name,
             effect_partition,
             effect_from_offset,
             effect_until_offset) = effective_offset
            log.debug(
                "effective batch offsets (from db):"
                " OffSetRanges: %s %s %s %s" % (
                    effect_topic_name, effect_partition,
                    effect_from_offset, effect_until_offset))

        # effective batch revision
        effective_offset_range_list = []
        for offset_range in offset_range_list:
            part_topic_key = "_".join((offset_range.topic,
                                       str(offset_range.partition)))
            effective_offset = effective_batch_offsets.get(part_topic_key,
                                                           None)
            if effective_offset:
                (effect_app_name,
                 effect_topic_name,
                 effect_partition,
                 effect_from_offset,
                 effect_until_offset) = effective_offset

                log.debug(
                    "Extending effective offset range:"
                    " OffSetRanges: %s %s %s-->%s %s" % (
                        effect_topic_name, effect_partition,
                        offset_range.fromOffset,
                        effect_from_offset,
                        effect_until_offset))

                effective_offset_range_list.append(
                    OffsetRange(offset_range.topic,
                                offset_range.partition,
                                effect_from_offset,
                                offset_range.untilOffset))
            else:
                effective_offset_range_list.append(
                    OffsetRange(offset_range.topic,
                                offset_range.partition,
                                offset_range.fromOffset,
                                offset_range.untilOffset))

        # return effective offset range list
        return effective_offset_range_list
Example #19
def handler(rdd_mapped):
    """
    Handle the prepared RDD. Append each RDD item's 'payload' field to a
    string, build a JSON object from that string, filter out the 'fields'
    field, then call sendToBroker().
    """
    records = rdd_mapped.collect()
    records_str = ""

    for record in records:
        records_str = records_str + str(record['payload']) + "\n"

    json_records = json.loads(records_str)

    # filter out "fields" field
    json_records.pop('fields', None)

    sendToBroker(json.dumps(json_records, indent=2))


if __name__ == "__main__":
    """Create Spark context, create KafkaRDD, prepare RDD for filtering."""
    sc = SparkContext(appName="Kafka")
    sc.setLogLevel("WARN")

    offset = OffsetRange(TOPIC_IN, 0, 0, 16)
    rdd = KafkaUtils.createRDD(sc, {"metadata.broker.list": BROKER}, [offset])
    rdd_mapped = rdd.map(lambda v: json.loads(v[1]))
    handler(rdd_mapped)
Example #20
    def test_pod_net_in_usage_app(self, usage_manager, setter_manager,
                                  insert_manager, data_driven_specs_repo):

        # load components
        usage_manager.return_value = MockComponentManager.get_usage_cmpt_mgr()
        setter_manager.return_value = \
            MockComponentManager.get_setter_cmpt_mgr()
        insert_manager.return_value = \
            MockComponentManager.get_insert_cmpt_mgr()

        # init mock driver tables
        data_driven_specs_repo.return_value = \
            MockDataDrivenSpecsRepo(self.spark_context,
                                    self.get_pre_transform_specs_json_app(),
                                    self.get_transform_specs_json_app())

        # Create an emulated set of Kafka messages (these were gathered
        # by extracting Monasca messages from the Metrics queue on mini-mon).

        # Create an RDD out of the mocked Monasca metrics
        with open(DataProvider.fetch_quantity_data_path) as f:
            raw_lines = f.read().splitlines()
        raw_tuple_list = [eval(raw_line) for raw_line in raw_lines]

        rdd_monasca = self.spark_context.parallelize(raw_tuple_list)

        # decorate mocked RDD with dummy kafka offsets
        myOffsetRanges = [OffsetRange("metrics", 1, 10,
                                      20)]  # mimic rdd.offsetRanges()

        transform_context = TransformContextUtils.get_context(
            offset_info=myOffsetRanges,
            batch_time_info=self.get_dummy_batch_time())

        rdd_monasca_with_offsets = rdd_monasca.map(
            lambda x: RddTransformContext(x, transform_context))

        # Call the primary method in mon_metrics_kafka
        MonMetricsKafkaProcessor.rdd_to_recordstore(rdd_monasca_with_offsets)

        # get the metrics that have been submitted to the dummy message adapter
        metrics = DummyAdapter.adapter_impl.metric_list

        pod_net_usage_agg_metric = [
            value for value in metrics
            if value.get('metric').get('name') == 'pod.net.in_bytes_sec_agg'
            and value.get('metric').get('dimensions').get('app') == 'junk'
            and value.get('metric').get('dimensions').get('namespace') == 'all'
            and value.get('metric').get('dimensions').get('pod_name') == 'all'
        ][0]

        self.assertTrue(pod_net_usage_agg_metric is not None)

        self.assertEqual('pod.net.in_bytes_sec_agg',
                         pod_net_usage_agg_metric.get('metric').get('name'))

        self.assertEqual(
            'junk',
            pod_net_usage_agg_metric.get("metric").get('dimensions').get(
                'app'))

        self.assertEqual(
            'all',
            pod_net_usage_agg_metric.get("metric").get('dimensions').get(
                'namespace'))

        self.assertEqual(
            'all',
            pod_net_usage_agg_metric.get("metric").get('dimensions').get(
                'pod_name'))

        self.assertEqual(122.94,
                         pod_net_usage_agg_metric.get('metric').get('value'))
        self.assertEqual('useast',
                         pod_net_usage_agg_metric.get('meta').get('region'))

        self.assertEqual(cfg.CONF.messaging.publish_kafka_project_id,
                         pod_net_usage_agg_metric.get('meta').get('tenantId'))

        self.assertEqual(
            'hourly',
            pod_net_usage_agg_metric.get('metric').get('dimensions').get(
                'aggregation_period'))

        self.assertEqual(
            3.0,
            pod_net_usage_agg_metric.get('metric').get('value_meta').get(
                'record_count'))
        self.assertEqual(
            '2017-01-24 20:14:47',
            pod_net_usage_agg_metric.get('metric').get('value_meta').get(
                'firstrecord_timestamp_string'))
        self.assertEqual(
            '2017-01-24 20:15:47',
            pod_net_usage_agg_metric.get('metric').get('value_meta').get(
                'lastrecord_timestamp_string'))
Example #21
    def test_vcpus_by_all(self, usage_manager, setter_manager, insert_manager,
                          data_driven_specs_repo):

        # load components
        usage_manager.return_value = MockComponentManager.get_usage_cmpt_mgr()
        setter_manager.return_value = \
            MockComponentManager.get_setter_cmpt_mgr()
        insert_manager.return_value = \
            MockComponentManager.get_insert_pre_hourly_cmpt_mgr()

        # init mock driver tables
        data_driven_specs_repo.return_value = \
            MockDataDrivenSpecsRepo(
                self.spark_context,
                self.get_pre_transform_specs_json_by_all(),
                self.get_transform_specs_json_by_all())

        # Create an RDD out of the mocked Monasca metrics
        with open(DataProvider.kafka_data_path) as f:
            raw_lines = f.read().splitlines()
        raw_tuple_list = [eval(raw_line) for raw_line in raw_lines]

        rdd_monasca = self.spark_context.parallelize(raw_tuple_list)

        # decorate mocked RDD with dummy kafka offsets
        myOffsetRanges = [OffsetRange("metrics", 1, 10,
                                      20)]  # mimic rdd.offsetRanges()

        transform_context = TransformContextUtils.get_context(
            offset_info=myOffsetRanges,
            batch_time_info=self.get_dummy_batch_time())

        rdd_monasca_with_offsets = rdd_monasca.map(
            lambda x: RddTransformContext(x, transform_context))

        # Call the primary method in mon_metrics_kafka
        MonMetricsKafkaProcessor.rdd_to_recordstore(rdd_monasca_with_offsets)

        # get the metrics that have been submitted to the dummy message adapter
        metrics = DummyAdapter.adapter_impl.metric_list
        vm_cpu_list = map(dump_as_ascii_string, metrics)
        DummyAdapter.adapter_impl.metric_list = []

        vm_cpu_rdd = self.spark_context.parallelize(vm_cpu_list)
        sql_context = SQLContext(self.spark_context)
        vm_cpu_df = sql_context.read.json(vm_cpu_rdd)
        PreHourlyProcessor.do_transform(vm_cpu_df)

        metrics = DummyAdapter.adapter_impl.metric_list
        vcpus_agg_metric = [
            value for value in metrics
            if value.get('metric').get('name') == 'vcpus_agg' and value.get(
                'metric').get('dimensions').get('project_id') == 'all'
        ][0]

        self.assertTrue(vcpus_agg_metric is not None)

        self.assertEqual(7.0, vcpus_agg_metric.get('metric').get('value'))
        self.assertEqual('useast', vcpus_agg_metric.get('meta').get('region'))

        self.assertEqual(cfg.CONF.messaging.publish_kafka_project_id,
                         vcpus_agg_metric.get('meta').get('tenantId'))
        self.assertEqual(
            'all',
            vcpus_agg_metric.get('metric').get('dimensions').get('host'))
        self.assertEqual(
            'hourly',
            vcpus_agg_metric.get('metric').get('dimensions').get(
                'aggregation_period'))

        self.assertEqual(
            14.0,
            vcpus_agg_metric.get('metric').get('value_meta').get(
                'record_count'))
        self.assertEqual(
            '2016-01-20 16:40:00',
            vcpus_agg_metric.get('metric').get('value_meta').get(
                'firstrecord_timestamp_string'))
        self.assertEqual(
            '2016-01-20 16:40:46',
            vcpus_agg_metric.get('metric').get('value_meta').get(
                'lastrecord_timestamp_string'))
Example #22
    def test_fetch_quantity_avg(self, usage_manager, setter_manager,
                                insert_manager, data_driven_specs_repo):

        # test operation
        test_operation = "avg"

        # load components
        usage_manager.return_value = MockComponentManager.get_usage_cmpt_mgr()
        setter_manager.return_value = \
            MockComponentManager.get_setter_cmpt_mgr()
        insert_manager.return_value = \
            MockComponentManager.get_insert_cmpt_mgr()

        # init mock driver tables
        data_driven_specs_repo.return_value = \
            MockDataDrivenSpecsRepo(self.spark_context,
                                    self.get_pre_transform_specs_json(),
                                    self.get_transform_specs_json_by_operation(
                                        test_operation))

        # Create an emulated set of Kafka messages (these were gathered
        # by extracting Monasca messages from the Metrics queue on mini-mon).

        # Create an RDD out of the mocked Monasca metrics
        with open(DataProvider.kafka_data_path) as f:
            raw_lines = f.read().splitlines()
        raw_tuple_list = [eval(raw_line) for raw_line in raw_lines]

        rdd_monasca = self.spark_context.parallelize(raw_tuple_list)

        # decorate mocked RDD with dummy kafka offsets
        myOffsetRanges = [OffsetRange("metrics", 1, 10,
                                      20)]  # mimic rdd.offsetRanges()

        transform_context = TransformContextUtils.get_context(
            offset_info=myOffsetRanges,
            batch_time_info=self.get_dummy_batch_time())

        rdd_monasca_with_offsets = rdd_monasca.map(
            lambda x: RddTransformContext(x, transform_context))

        # Call the primary method in mon_metrics_kafka
        MonMetricsKafkaProcessor.rdd_to_recordstore(rdd_monasca_with_offsets)

        # get the metrics that have been submitted to the dummy message adapter
        metrics = DummyAdapter.adapter_impl.metric_list

        utilized_cpu_logical_agg_metric = [
            value for value in metrics if value.get('metric').get('name') ==
            'cpu.utilized_logical_cores_agg'
        ][0]

        self.assertEqual(
            7.134214285714285,
            utilized_cpu_logical_agg_metric.get('metric').get('value'))
        self.assertEqual(
            'useast',
            utilized_cpu_logical_agg_metric.get('meta').get('region'))

        self.assertEqual(
            cfg.CONF.messaging.publish_kafka_project_id,
            utilized_cpu_logical_agg_metric.get('meta').get('tenantId'))
        self.assertEqual(
            'all',
            utilized_cpu_logical_agg_metric.get('metric').get(
                'dimensions').get('host'))
        self.assertEqual(
            'all',
            utilized_cpu_logical_agg_metric.get('metric').get(
                'dimensions').get('project_id'))
        self.assertEqual(
            'hourly',
            utilized_cpu_logical_agg_metric.get('metric').get(
                'dimensions').get('aggregation_period'))

        self.assertEqual(
            13.0,
            utilized_cpu_logical_agg_metric.get('metric').get(
                'value_meta').get('record_count'))
        self.assertEqual(
            '2016-03-07 16:09:23',
            utilized_cpu_logical_agg_metric.get('metric').get(
                'value_meta').get('firstrecord_timestamp_string'))
        self.assertEqual(
            '2016-03-07 16:10:38',
            utilized_cpu_logical_agg_metric.get('metric').get(
                'value_meta').get('lastrecord_timestamp_string'))
Example #23
    def test_rdd_to_recordstore(self, usage_manager, setter_manager,
                                insert_manager):

        usage_manager.return_value = MockComponentManager.get_usage_cmpt_mgr()
        setter_manager.return_value = \
            MockComponentManager.get_setter_cmpt_mgr()
        insert_manager.return_value = \
            MockComponentManager.get_insert_pre_hourly_cmpt_mgr()

        # Create an RDD out of the mocked Monasca metrics
        with open(DataProvider.kafka_data_path) as f:
            raw_lines = f.read().splitlines()
        raw_tuple_list = [eval(raw_line) for raw_line in raw_lines]

        rdd_monasca = self.spark_context.parallelize(raw_tuple_list)

        # decorate mocked RDD with dummy kafka offsets
        myOffsetRanges = [OffsetRange("metrics", 1, 10,
                                      20)]  # mimic rdd.offsetRanges()

        transform_context = TransformContextUtils.get_context(
            offset_info=myOffsetRanges,
            batch_time_info=self.get_dummy_batch_time())

        rdd_monasca_with_offsets = rdd_monasca.map(
            lambda x: RddTransformContext(x, transform_context))

        # Call the primary method in mon_metrics_kafka
        MonMetricsKafkaProcessor.rdd_to_recordstore(rdd_monasca_with_offsets)

        host_usage_list = DummyAdapter.adapter_impl.metric_list
        host_usage_list = map(dump_as_ascii_string, host_usage_list)
        DummyAdapter.adapter_impl.metric_list = []
        host_usage_rdd = self.spark_context.parallelize(host_usage_list)
        sql_context = SQLContext(self.spark_context)
        host_usage_df = sql_context.read.json(host_usage_rdd)
        PreHourlyProcessor.do_transform(host_usage_df)

        # get the metrics that have been submitted to the dummy message adapter
        metrics = DummyAdapter.adapter_impl.metric_list

        # Verify cpu.total_logical_cores_agg for all hosts
        total_cpu_logical_agg_metric = [
            value for value in metrics
            if value.get('metric').get('name') == 'cpu.total_logical_cores_agg'
            and value.get('metric').get('dimensions').get('host') == 'all'
        ][0]

        self.assertEqual(
            15.0,
            total_cpu_logical_agg_metric.get('metric').get('value'))
        self.assertEqual(
            'useast',
            total_cpu_logical_agg_metric.get('meta').get('region'))
        self.assertEqual(
            cfg.CONF.messaging.publish_kafka_project_id,
            total_cpu_logical_agg_metric.get('meta').get('tenantId'))
        self.assertEqual(
            'all',
            total_cpu_logical_agg_metric.get('metric').get('dimensions').get(
                'project_id'))
        self.assertEqual(
            'hourly',
            total_cpu_logical_agg_metric.get('metric').get('dimensions').get(
                'aggregation_period'))

        self.assertEqual(
            13.0,
            total_cpu_logical_agg_metric.get('metric').get('value_meta').get(
                'record_count'))
        self.assertEqual(
            '2016-03-07 16:09:23',
            total_cpu_logical_agg_metric.get('metric').get('value_meta').get(
                'firstrecord_timestamp_string'))
        self.assertEqual(
            '2016-03-07 16:10:38',
            total_cpu_logical_agg_metric.get('metric').get('value_meta').get(
                'lastrecord_timestamp_string'))

        # Verify cpu.total_logical_cores_agg for mini-mon host
        total_cpu_logical_agg_metric = [
            value for value in metrics
            if value.get('metric').get('name') == 'cpu.total_logical_cores_agg'
            and value.get('metric').get('dimensions').get(
                'host') == 'test-cp1-comp0333-mgmt'
        ][0]

        self.assertEqual(
            9.0,
            total_cpu_logical_agg_metric.get('metric').get('value'))
        self.assertEqual(
            'useast',
            total_cpu_logical_agg_metric.get('meta').get('region'))
        self.assertEqual(
            cfg.CONF.messaging.publish_kafka_project_id,
            total_cpu_logical_agg_metric.get('meta').get('tenantId'))
        self.assertEqual(
            'all',
            total_cpu_logical_agg_metric.get('metric').get('dimensions').get(
                'project_id'))
        self.assertEqual(
            'hourly',
            total_cpu_logical_agg_metric.get('metric').get('dimensions').get(
                'aggregation_period'))

        self.assertEqual(
            6.0,
            total_cpu_logical_agg_metric.get('metric').get('value_meta').get(
                'record_count'))
        self.assertEqual(
            '2016-03-07 16:09:23',
            total_cpu_logical_agg_metric.get('metric').get('value_meta').get(
                'firstrecord_timestamp_string'))
        self.assertEqual(
            '2016-03-07 16:10:38',
            total_cpu_logical_agg_metric.get('metric').get('value_meta').get(
                'lastrecord_timestamp_string'))

        # Verify cpu.total_logical_cores_agg for devstack host
        total_cpu_logical_agg_metric = [
            value for value in metrics
            if value.get('metric').get('name') == 'cpu.total_logical_cores_agg'
            and value.get('metric').get('dimensions').get(
                'host') == 'test-cp1-comp0027-mgmt'
        ][0]

        self.assertEqual(
            6.0,
            total_cpu_logical_agg_metric.get('metric').get('value'))
        self.assertEqual(
            'useast',
            total_cpu_logical_agg_metric.get('meta').get('region'))
        self.assertEqual(
            cfg.CONF.messaging.publish_kafka_project_id,
            total_cpu_logical_agg_metric.get('meta').get('tenantId'))
        self.assertEqual(
            'all',
            total_cpu_logical_agg_metric.get('metric').get('dimensions').get(
                'project_id'))
        self.assertEqual(
            'hourly',
            total_cpu_logical_agg_metric.get('metric').get('dimensions').get(
                'aggregation_period'))

        self.assertEqual(
            7.0,
            total_cpu_logical_agg_metric.get('metric').get('value_meta').get(
                'record_count'))
        self.assertEqual(
            '2016-03-07 16:09:23',
            total_cpu_logical_agg_metric.get('metric').get('value_meta').get(
                'firstrecord_timestamp_string'))
        self.assertEqual(
            '2016-03-07 16:10:38',
            total_cpu_logical_agg_metric.get('metric').get('value_meta').get(
                'lastrecord_timestamp_string'))

        # Verify cpu.utilized_logical_cores_agg for all hosts
        utilized_cpu_logical_agg_metric = [
            value for value in metrics if
            value.get('metric').get('name') == 'cpu.utilized_logical_cores_agg'
            and value.get('metric').get('dimensions').get('host') == 'all'
        ][0]

        self.assertEqual(
            7.134214285714285,
            utilized_cpu_logical_agg_metric.get('metric').get('value'))
        self.assertEqual(
            'useast',
            utilized_cpu_logical_agg_metric.get('meta').get('region'))

        self.assertEqual(
            cfg.CONF.messaging.publish_kafka_project_id,
            utilized_cpu_logical_agg_metric.get('meta').get('tenantId'))
        self.assertEqual(
            'all',
            utilized_cpu_logical_agg_metric.get('metric').get(
                'dimensions').get('project_id'))
        self.assertEqual(
            'hourly',
            utilized_cpu_logical_agg_metric.get('metric').get(
                'dimensions').get('aggregation_period'))

        self.assertEqual(
            13.0,
            utilized_cpu_logical_agg_metric.get('metric').get(
                'value_meta').get('record_count'))
        self.assertEqual(
            '2016-03-07 16:09:23',
            utilized_cpu_logical_agg_metric.get('metric').get(
                'value_meta').get('firstrecord_timestamp_string'))
        self.assertEqual(
            '2016-03-07 16:10:38',
            utilized_cpu_logical_agg_metric.get('metric').get(
                'value_meta').get('lastrecord_timestamp_string'))

        # Verify cpu.utilized_logical_cores_agg for the mini-mon host
        utilized_cpu_logical_agg_metric = [
            value for value in metrics if value.get('metric').get('name') ==
            'cpu.utilized_logical_cores_agg' and value.get('metric').get(
                'dimensions').get('host') == 'test-cp1-comp0333-mgmt'
        ][0]

        self.assertEqual(
            4.9665,
            utilized_cpu_logical_agg_metric.get('metric').get('value'))
        self.assertEqual(
            'useast',
            utilized_cpu_logical_agg_metric.get('meta').get('region'))

        self.assertEqual(
            cfg.CONF.messaging.publish_kafka_project_id,
            utilized_cpu_logical_agg_metric.get('meta').get('tenantId'))
        self.assertEqual(
            'all',
            utilized_cpu_logical_agg_metric.get('metric').get(
                'dimensions').get('project_id'))
        self.assertEqual(
            'hourly',
            utilized_cpu_logical_agg_metric.get('metric').get(
                'dimensions').get('aggregation_period'))

        self.assertEqual(
            6.0,
            utilized_cpu_logical_agg_metric.get('metric').get(
                'value_meta').get('record_count'))
        self.assertEqual(
            '2016-03-07 16:09:23',
            utilized_cpu_logical_agg_metric.get('metric').get(
                'value_meta').get('firstrecord_timestamp_string'))
        self.assertEqual(
            '2016-03-07 16:10:38',
            utilized_cpu_logical_agg_metric.get('metric').get(
                'value_meta').get('lastrecord_timestamp_string'))

        # Verify cpu.utilized_logical_cores_agg for the devstack host
        utilized_cpu_logical_agg_metric = [
            value for value in metrics if value.get('metric').get('name') ==
            'cpu.utilized_logical_cores_agg' and value.get('metric').get(
                'dimensions').get('host') == 'test-cp1-comp0027-mgmt'
        ][0]

        self.assertEqual(
            2.1677142857142853,
            utilized_cpu_logical_agg_metric.get('metric').get('value'))
        self.assertEqual(
            'useast',
            utilized_cpu_logical_agg_metric.get('meta').get('region'))

        self.assertEqual(
            cfg.CONF.messaging.publish_kafka_project_id,
            utilized_cpu_logical_agg_metric.get('meta').get('tenantId'))
        self.assertEqual(
            'all',
            utilized_cpu_logical_agg_metric.get('metric').get(
                'dimensions').get('project_id'))
        self.assertEqual(
            'hourly',
            utilized_cpu_logical_agg_metric.get('metric').get(
                'dimensions').get('aggregation_period'))

        self.assertEqual(
            7.0,
            utilized_cpu_logical_agg_metric.get('metric').get(
                'value_meta').get('record_count'))
        self.assertEqual(
            '2016-03-07 16:09:23',
            utilized_cpu_logical_agg_metric.get('metric').get(
                'value_meta').get('firstrecord_timestamp_string'))
        self.assertEqual(
            '2016-03-07 16:10:38',
            utilized_cpu_logical_agg_metric.get('metric').get(
                'value_meta').get('lastrecord_timestamp_string'))
Example #24
#dstream_time_interval = 5
#ssc = pyspark.streaming.StreamingContext(sc,dstream_time_interval)

#@staticmethod
#def createRDD(sc, kafkaParams, offsetRanges, leaders={},
#keyDecoder=utf8_decoder, valueDecoder=utf8_decoder):

kafka_params = {
    "zookeeper.connect": "localhost:2182",
    "metadata.broker.list": "localhost:9092",
    "group.id": "TutorialGroup1",
    "zookeeper.connection.timeout.ms": "10000"
}

tutorial1 = OffsetRange(topic='movie_reviews',
                        partition=0,
                        fromOffset=0,
                        untilOffset=2)

offset_ranges = [tutorial1]

kafka = pyspark.streaming.kafka.KafkaUtils.createRDD(sc, kafka_params,
                                                     offset_ranges)
kafka = kafka.map(lambda x: x[1])
print kafka.collect()
#kafka = pyspark.streaming.kafka.KafkaUtils.createStream(ssc, 'localhost:2181', 'TutorialGroup1', {'t1':1})
#kafka.pprint()

#ssc.start()
#ssc.awaitTermination()