Example #1
 def _test_pipeline(self,
                    configuration_path,
                    processor_creator,
                    input_dir,
                    expected_result_file,
                    print_result=False):
     """
     Method for checking equals between driver result and manually generated result file
     :param configuration_path: path to config file
     :param processor_creator: processor for event_creator
     :param input_dir: path to input messages
     :param expected_result_file: path to expected result
     :param print_result: flag for only printing results to console for debugging
     :return:
     """
     # Unique suffix so each test run writes to its own result table
     table_uuid_postfix = "_" + str(uuid.uuid1()).replace("-", "_")
     configuration = Utils.load_config(configuration_path)
     pipeline = TestPipeline(configuration,
                             processor_creator(configuration), input_dir,
                             "test_result" + table_uuid_postfix)
     pipeline.process_all_available()
     # Collect the JSON rows produced by every active streaming query
     result_tables_list = [[
         json.loads(row.value)
         for row in pipeline.spark.sql("select value from " +
                                       query.name).collect()
     ] for query in pipeline.spark.streams.active]
     # Flatten the per-query row lists into a single result list
     result = [table for results in result_tables_list for table in results]
     pipeline.terminate_active_streams()
     if print_result:
         for row in result:
             print(row)
     else:
         expected_result = self.__read_expected_result(expected_result_file)
         self.maxDiff = None
         self.assertItemsEqual(expected_result, result)
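
A concrete test would call this helper with a configuration path and a processor factory. A minimal sketch, assuming hypothetical resource paths and reusing the LogParsingProcessor and create_event_creators names from Example #2:

 def test_log_parsing(self):
     # All paths below are hypothetical placeholders, not paths from the repo.
     self._test_pipeline(
         "configurations/log_parsing.yml",
         lambda conf: LogParsingProcessor(conf, create_event_creators(conf)),
         "test/resources/input_messages",
         "test/resources/expected_result.json")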
Example #2
                            r"(?P<finished_time>\w+?\s+?\w+?\s+?\d{1,2}\s+?\d{2}:\d{2}:\d{2}\s+?\w+?\s+?\d{4}).*"
                        ))).add_intermediate_result_parser(
                            duration_event_creator),
                Utils.get_output_topic(configuration, "reingest"))
        })
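
The finished_time group above matches a ctime-style timestamp such as "Mon Jun 5 12:00:30 UTC 2017". A quick standalone check, using a hypothetical log line:

import re

FINISHED_TIME = (r"(?P<finished_time>\w+?\s+?\w+?\s+?\d{1,2}\s+?"
                 r"\d{2}:\d{2}:\d{2}\s+?\w+?\s+?\d{4}).*")

# Hypothetical sample line, for illustration only.
match = re.search(FINISHED_TIME, "Mon Jun 5 12:00:30 UTC 2017 reingest done")
print(match.group("finished_time"))  # Mon Jun 5 12:00:30 UTC 2017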


def duration_update(started_script, finished_script, finished_time, timestamp):
    """
    if started script equals finished script duration is calculated
    :param started_script
    :param finished_script
    :param finished_time
    :param timestamp
    :return: duration
    ":exception: ParsingException
    """
    if started_script == finished_script:
        return abs(finished_time - timestamp).seconds
    else:
        raise ParsingException(
            "Message contains different started and finished scripts")


if __name__ == "__main__":
    configuration = Utils.load_config(sys.argv[:])
    KafkaPipeline(
        configuration,
        LogParsingProcessor(configuration,
                            create_event_creators(configuration))).start()
Example #3
import sys
import uuid

from datetime import datetime, timedelta
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

from applications.vspp_unique_activities.recording_state import RecordingState
from common.kafka_helper import KafkaHelper
from util.utils import Utils

if __name__ == "__main__":

    config = Utils.load_config(sys.argv[:])

    # Spark driver context configured from the application config
    sc = SparkContext(appName=config.property('spark.appName'),
                      master=config.property('spark.master'))

    # Streaming context; checkpointing is required for stateful processing
    ssc = StreamingContext(sc, config.property('spark.batchInterval'))
    ssc.checkpoint(config.property('spark.checkpointLocation'))

    # Kafka input configs
    options = config.kafka_input_options()
    input_stream = KafkaUtils.createDirectStream(ssc,
                                                 config.kafka_input_topics(),
                                                 options)
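    # createDirectStream yields a DStream of (key, value) string pairs,
    # one pair per consumed Kafka record; no receiver is used.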

    def parse_message(message):