示例#1
0
    def _build_raw_model_flow(self, schema, dag):
        """
        Attach the raw model branch of the given schema to the DAG.

        The tasks are chained as:
        feature aggregation buckets short circuit -> feature aggregation buckets ->
        raw model short circuit -> raw model

        :param schema: schema name; used as the suffix of the short circuit task ids
        :param dag: the DAG the operators are attached to
        """
        def _buckets_gate_callable(**kwargs):
            # The interval is read from the configuration each time the callable runs.
            return is_execution_date_valid(
                kwargs['execution_date'],
                FeatureAggregationBucketsOperatorBuilder.get_feature_aggregation_buckets_interval(
                    PresidioDagBuilder.conf_reader),
                get_schedule_interval(dag))

        buckets_gate = self._create_infinite_retry_short_circuit_operator(
            task_id='feature_aggregation_buckets_short_circuit_{0}'.format(schema),
            dag=dag,
            python_callable=_buckets_gate_callable)

        buckets_task = FeatureAggregationBucketsOperatorBuilder(schema).build(dag)

        def _raw_model_gate_callable(**kwargs):
            run_date = kwargs['execution_date']
            interval_check = is_execution_date_valid(
                run_date,
                RawModelOperatorBuilder.get_build_raw_model_interval(PresidioDagBuilder.conf_reader),
                get_schedule_interval(dag))
            gap_check = PresidioDagBuilder.validate_the_gap_between_dag_start_date_and_current_execution_date(
                dag,
                RawModelOperatorBuilder.get_min_gap_from_dag_start_date_to_start_raw_modeling(
                    PresidioDagBuilder.conf_reader),
                run_date,
                get_schedule_interval(dag))
            # '&' (not 'and'): both checks are always evaluated before being combined.
            return interval_check & gap_check

        raw_model_gate = self._create_infinite_retry_short_circuit_operator(
            task_id='raw_model_short_circuit_{0}'.format(schema),
            dag=dag,
            python_callable=_raw_model_gate_callable)

        raw_model_task = RawModelOperatorBuilder(schema).build(dag)

        buckets_gate >> buckets_task >> raw_model_gate >> raw_model_task
示例#2
0
    def _build_aggr_model_flow(self, schema, dag):
        """
        Attach the aggregation model branch of the given schema to the DAG.

        The tasks are chained as:
        aggr accumulate short circuit -> accumulate aggregations ->
        aggr model short circuit -> aggr model

        :param schema: schema name; used as the suffix of the short circuit task ids
        :param dag: the DAG the operators are attached to
        """
        accumulate_task = AccumulateAggregationsOperatorBuilder(schema, FIX_DURATION_STRATEGY_HOURLY).build(dag)

        def _accumulate_gate_callable(**kwargs):
            return is_execution_date_valid(
                kwargs['execution_date'],
                AccumulateAggregationsOperatorBuilder.get_accumulate_interval(PresidioDagBuilder.conf_reader),
                get_schedule_interval(dag))

        accumulate_gate = self._create_infinite_retry_short_circuit_operator(
            task_id='aggr_accumulate_short_circuit_{0}'.format(schema),
            dag=dag,
            python_callable=_accumulate_gate_callable)

        def _aggr_model_gate_callable(**kwargs):
            run_date = kwargs['execution_date']
            interval_check = is_execution_date_valid(
                run_date,
                AggrModelOperatorBuilder.get_aggr_model_interval(PresidioDagBuilder.conf_reader),
                get_schedule_interval(dag))
            gap_check = PresidioDagBuilder.validate_the_gap_between_dag_start_date_and_current_execution_date(
                dag,
                AggrModelOperatorBuilder.get_min_gap_from_dag_start_date_to_start_aggr_modeling(
                    PresidioDagBuilder.conf_reader),
                run_date,
                get_schedule_interval(dag))
            # '&' (not 'and'): both checks are always evaluated before being combined.
            return interval_check & gap_check

        aggr_model_gate = self._create_infinite_retry_short_circuit_operator(
            task_id='aggr_model_short_circuit_{0}'.format(schema),
            dag=dag,
            python_callable=_aggr_model_gate_callable)

        aggr_model_task = AggrModelOperatorBuilder(schema).build(dag)

        accumulate_gate >> accumulate_task >> aggr_model_gate >> aggr_model_task
示例#3
0
    def _build_output_operator(self, smart_record_conf_name, entity_type, dag,
                               smart_operator):
        """
        Add the output tasks to the DAG, downstream of the given smart operator.

        Creates an hourly output operator behind its own short circuit, an entity
        score operator behind a daily short circuit, and delegates the wiring
        between the hourly output and the daily short circuit to
        self._push_forwarding.

        :param smart_record_conf_name: passed to the output operator and to the entity score builder
        :param entity_type: passed to the output operator, the entity score builder and the forwarder
        :param dag: the DAG to populate
        :param smart_operator: upstream task of the output short circuit
        :return: the entity score operator
        """
        self.log.debug("populating the %s dag with output tasks", dag.dag_id)

        sensor_service = TaskSensorService()

        def _hourly_output_gate_callable(**kwargs):
            # Combines the hourly interval check with the check of the gap
            # between the dag start date and the current execution date.
            run_date = kwargs['execution_date']
            interval_check = is_execution_date_valid(
                run_date, FIX_DURATION_STRATEGY_HOURLY, dag.schedule_interval)
            gap_check = PresidioDagBuilder.validate_the_gap_between_dag_start_date_and_current_execution_date(
                dag, self._min_gap_from_dag_start_date_to_start_scoring,
                run_date, dag.schedule_interval)
            return interval_check & gap_check

        output_gate = self._create_infinite_retry_short_circuit_operator(
            task_id='output_short_circuit',
            dag=dag,
            python_callable=_hourly_output_gate_callable)

        # hourly output processor
        hourly_output_task = OutputOperator(
            fixed_duration_strategy=timedelta(hours=1),
            command=PresidioDagBuilder.presidio_command,
            smart_record_conf_name=smart_record_conf_name,
            entity_type=entity_type,
            dag=dag,
        )
        sensor_service.add_task_sequential_sensor(hourly_output_task)
        sensor_service.add_task_short_circuit(hourly_output_task, output_gate)

        # entity score
        entity_score_task = EntityScoreOperatorBuilder(
            smart_record_conf_name, entity_type).build(dag)

        def _daily_gate_callable(**kwargs):
            run_date = kwargs['execution_date']
            interval_check = is_execution_date_valid(
                run_date, FIX_DURATION_STRATEGY_DAILY, dag.schedule_interval)
            gap_check = PresidioDagBuilder.validate_the_gap_between_dag_start_date_and_current_execution_date(
                dag,
                EntityScoreOperatorBuilder.get_min_gap_from_dag_start_date_to_start_modeling(
                    PresidioDagBuilder.conf_reader),
                run_date,
                dag.schedule_interval)
            return interval_check & gap_check

        # The daily short circuit sits between the output processing and the
        # entity score recalculation.
        daily_gate = self._create_infinite_retry_short_circuit_operator(
            task_id='output_daily_short_circuit',
            dag=dag,
            python_callable=_daily_gate_callable)

        daily_gate >> entity_score_task
        self._push_forwarding(hourly_output_task, daily_gate, dag, entity_type)

        smart_operator >> output_gate

        return entity_score_task
示例#4
0
    def _push_forwarding(self, hourly_output_operator,
                         daily_short_circuit_operator, dag, entity_type):
        """
        Connect the hourly output operator to the daily short circuit operator,
        optionally through an output forwarder task.

        The forwarder (and its own short circuit) is inserted only when the DAG
        default args hold the string 'true' under "enable_output_forwarder";
        for any other value the two given operators are linked directly.

        :param hourly_output_operator: upstream operator
        :param daily_short_circuit_operator: downstream operator
        :param dag: the DAG whose default_args are consulted and that new tasks are attached to
        :param entity_type: passed to the forwarder operator
        """
        self.log.debug("creating the forwarder task")

        forwarder_flag = dag.default_args.get("enable_output_forwarder")
        self.log.debug("enable_output_forwarder=%s ", forwarder_flag)

        if forwarder_flag != 'true':
            # Forwarding disabled: link the two operators directly.
            hourly_output_operator >> daily_short_circuit_operator
            return

        forwarder_task = OutputForwarderOperator(
            command=PresidioDagBuilder.presidio_command,
            entity_type=entity_type,
            run_clean_command_before_retry=False,
            dag=dag)

        def _forwarder_gate_callable(**kwargs):
            run_date = kwargs['execution_date']
            interval_check = is_execution_date_valid(
                run_date, FIX_DURATION_STRATEGY_HOURLY, dag.schedule_interval)
            gap_check = PresidioDagBuilder.validate_the_gap_between_dag_start_date_and_current_execution_date(
                dag,
                EntityScoreOperatorBuilder.get_min_gap_from_dag_start_date_to_start_modeling(
                    PresidioDagBuilder.conf_reader),
                run_date,
                dag.schedule_interval)
            # '&' (not 'and'): both checks are always evaluated before being combined.
            return interval_check & gap_check

        forwarder_gate = self._create_infinite_retry_short_circuit_operator(
            task_id='output_forward_short_circuit',
            dag=dag,
            python_callable=_forwarder_gate_callable)

        hourly_output_operator >> forwarder_gate >> forwarder_task >> daily_short_circuit_operator
    def add_java_args(context):
        """
        Build the java arguments string holding the start and end dates of a run.

        'fixed_duration_strategy' and 'schedule_interval' are read from
        context['params']['retry_extra_params']; the execution date is taken
        from the context through ContextWrapper. An execution date that fails
        is_execution_date_valid is only logged - the arguments are built anyway.

        NOTE(review): this function takes no 'self' although it sits at method
        indentation; how it is bound (static method / nested function) is not
        visible here.

        :param context: task context, containing 'params' -> 'retry_extra_params'
        :return: the arguments joined by single spaces, each built as
                 SpringBootJarOperator.java_args_prefix + '<key> <value>'
        :rtype: str
        """
        retry_extra_params = context['params']['retry_extra_params']
        fixed_duration_strategy = retry_extra_params['fixed_duration_strategy']
        interval = retry_extra_params['schedule_interval']
        execution_date = ContextWrapper(context).get_execution_date()

        if not is_execution_date_valid(execution_date, fixed_duration_strategy,
                                       interval):
            logging.info(
                'The execution date {} is not the last interval of fixed duration {}.'
                .format(execution_date, fixed_duration_strategy))

        # Both boundaries are floored to the fixed duration before the UTC conversion.
        start_date = floor_time(execution_date,
                                time_delta=fixed_duration_strategy)
        end_date = floor_time(execution_date + interval,
                              time_delta=fixed_duration_strategy)
        java_args = {
            'start_date': convert_to_utc(start_date),
            'end_date': convert_to_utc(end_date),
        }

        # dict.items() is available on both Python 2 and Python 3, whereas
        # dict.iteritems() raises AttributeError on Python 3.
        return ' '.join(
            SpringBootJarOperator.java_args_prefix + '%s %s' % (key, val)
            for key, val in java_args.items())
    def execute(self, context):
        """
        Compute the start and end dates of the run, hand them over as java args
        and delegate to the parent operator's execute.

        The dates are the execution date and the execution date plus
        self.interval, each floored to self.fixed_duration_strategy and
        converted to UTC. An execution date that fails is_execution_date_valid
        is only reported to the log; execution continues regardless.

        :param context: the task context, forwarded to the parent execute
        """
        run_date = ContextWrapper(context).get_execution_date()

        date_is_valid = is_execution_date_valid(
            run_date, self.fixed_duration_strategy, self.interval)
        if not date_is_valid:
            # Example of a date that is reported here:
            #   execution_date = datetime(2014, 11, 28, 13, 50, 0)
            #   interval = timedelta(minutes=5)
            #   fixed_duration = timedelta(days=1)
            message = 'The execution date {} is not the last interval of fixed duration {}.'.format(
                run_date, self.fixed_duration_strategy)
            self.log.info(message)

        window_start = floor_time(run_date,
                                  time_delta=self.fixed_duration_strategy)
        window_end = floor_time(run_date + self.interval,
                                time_delta=self.fixed_duration_strategy)
        date_args = {
            'start_date': convert_to_utc(window_start),
            'end_date': convert_to_utc(window_end),
        }

        parent = super(FixedDurationJarOperator, self)
        parent.update_java_args(date_args)
        parent.execute(context)
    def build(self, dag):
        """
        Populate the given "Smart Model DAG" with a smart accumulating operator
        followed by a smart model build operator, each placed behind its own
        short circuit operator:

        smart_accumulate_short_circuit -> smart model accumulate ->
        smart_model_short_circuit -> smart model

        The accumulate interval, the build model interval and both minimal gaps
        from the dag start date are read from the configuration when the short
        circuit callables run.

        :param dag: The smart_model DAG to populate
        :type dag: airflow.models.DAG
        :return: The smart model DAG, after it has been populated
        :rtype: airflow.models.DAG
        """
        def _accumulate_gate_callable(**kwargs):
            run_date = kwargs['execution_date']
            interval_check = is_execution_date_valid(
                run_date,
                SmartModelAccumulateOperatorBuilder.get_accumulate_interval(
                    PresidioDagBuilder.conf_reader),
                get_schedule_interval(dag))
            gap_check = PresidioDagBuilder.validate_the_gap_between_dag_start_date_and_current_execution_date(
                dag,
                SmartModelAccumulateOperatorBuilder.get_min_gap_from_dag_start_date_to_start_accumulating(
                    PresidioDagBuilder.conf_reader),
                run_date,
                get_schedule_interval(dag))
            # '&' (not 'and'): both checks are always evaluated before being combined.
            return interval_check & gap_check

        accumulate_gate = self._create_infinite_retry_short_circuit_operator(
            task_id='smart_accumulate_short_circuit',
            dag=dag,
            python_callable=_accumulate_gate_callable)

        accumulate_task = SmartModelAccumulateOperatorBuilder().build(dag)

        def _model_gate_callable(**kwargs):
            run_date = kwargs['execution_date']
            interval_check = is_execution_date_valid(
                run_date,
                SmartModelOperatorBuilder.get_build_model_interval(
                    PresidioDagBuilder.conf_reader),
                get_schedule_interval(dag))
            gap_check = PresidioDagBuilder.validate_the_gap_between_dag_start_date_and_current_execution_date(
                dag,
                SmartModelOperatorBuilder.get_min_gap_from_dag_start_date_to_start_modeling(
                    PresidioDagBuilder.conf_reader),
                run_date,
                get_schedule_interval(dag))
            return interval_check & gap_check

        model_gate = self._create_infinite_retry_short_circuit_operator(
            task_id='smart_model_short_circuit',
            dag=dag,
            python_callable=_model_gate_callable)

        model_task = SmartModelOperatorBuilder().build(dag)

        accumulate_gate >> accumulate_task >> model_gate >> model_task
        return dag
    def _build_model_trigger_operator(self, dag, schema):
        """
        Create the operator that triggers the model DAG of the given schema and
        set that model DAG's schedule interval to FIX_DURATION_STRATEGY_DAILY.

        :param dag: the DAG the trigger operator is attached to
        :param schema: schema name; selects the model DAG id and prefixes the trigger task id
        :return: the trigger operator
        """
        target_dag_id = ModelDagFactory.get_dag_id(schema)

        def _select_dag_run(context, dag_run_obj):
            # Hand the dag run object back only when the execution date passes
            # the daily check; None otherwise.
            if is_execution_date_valid(context['execution_date'],
                                       FIX_DURATION_STRATEGY_DAILY,
                                       get_schedule_interval(dag)):
                return dag_run_obj
            return None

        trigger_task_id = '{0}_{1}'.format(schema, 'model_trigger_dagrun')
        trigger = self._create_expanded_trigger_dag_run_operator(
            trigger_task_id, target_dag_id, dag, _select_dag_run)

        set_schedule_interval(target_dag_id, FIX_DURATION_STRATEGY_DAILY)
        return trigger
    def _build_input_pre_processing_trigger_operator(self, dag, schema):
        """
        Create the operator that triggers the input pre processing DAG of the
        given schema.

        :param dag: the DAG the trigger operator is attached to
        :param schema: schema name; selects the target DAG id and prefixes the trigger task id
        :return: the trigger operator
        """
        target_dag_id = InputPreProcessingDagFactory.get_dag_id(schema)

        def _select_dag_run(context, dag_run_obj):
            # Hand the dag run object back only when the execution date passes
            # the daily check; None otherwise.
            if is_execution_date_valid(context['execution_date'],
                                       FIX_DURATION_STRATEGY_DAILY,
                                       get_schedule_interval(dag)):
                return dag_run_obj
            return None

        trigger_task_id = "{0}_input_pre_processing_trigger_dag_run".format(schema)
        return self._create_expanded_trigger_dag_run_operator(
            trigger_task_id, target_dag_id, dag, _select_dag_run)
 def _is_execution_date_valid(self, context):
     """
     Verify the execution date found in the context against
     self.fixed_duration_strategy and self.interval.

     Returns None when the date passes the check; otherwise the failure is
     logged as an error and an exception is raised.

     :param context: task context holding 'execution_date'
     :raise InvalidExecutionDateError: when is_execution_date_valid rejects the date
     """
     run_date = context['execution_date']
     if is_execution_date_valid(run_date, self.fixed_duration_strategy,
                                self.interval):
         return

     # Example of a rejected date:
     #   execution_date = datetime(2014, 11, 28, 13, 50, 0)
     #   interval = timedelta(minutes=5)
     #   fixed_duration = timedelta(days=1)
     message = 'The execution date {} is not the last interval of fixed duration {}.'.format(
         run_date, self.fixed_duration_strategy)
     logging.error(message)
     raise InvalidExecutionDateError(run_date, self.fixed_duration_strategy)
示例#11
0
    def _build_smart(self, root_dag_gap_sensor_operator, smart_dag,
                     smart_record_conf_name):
        """
        Add the smart events operator, its short circuit and the smart model
        trigger to the smart DAG.

        The short circuit is placed downstream of the given gap sensor, the
        smart model trigger downstream of the smart events operator, and the
        smart model DAG's schedule interval is set to FIX_DURATION_STRATEGY_DAILY.

        :param root_dag_gap_sensor_operator: upstream task of the smart short circuit
        :param smart_dag: the DAG to populate
        :param smart_record_conf_name: smart events configuration name; also selects the smart model DAG id
        :return: the smart events operator
        """
        sensor_service = TaskSensorService()

        def _smart_gate_callable(**kwargs):
            run_date = kwargs['execution_date']
            interval_check = is_execution_date_valid(
                run_date, FIX_DURATION_STRATEGY_HOURLY,
                smart_dag.schedule_interval)
            gap_check = PresidioDagBuilder.validate_the_gap_between_dag_start_date_and_current_execution_date(
                smart_dag, self._min_gap_from_dag_start_date_to_start_scoring,
                run_date, smart_dag.schedule_interval)
            # '&' (not 'and'): both checks are always evaluated before being combined.
            return interval_check & gap_check

        smart_gate = self._create_infinite_retry_short_circuit_operator(
            task_id='ade_scoring_hourly_short_circuit',
            dag=smart_dag,
            python_callable=_smart_gate_callable)

        smart_task = SmartEventsOperator(
            command=SmartEventsOperator.liors_special_run_command,
            fixed_duration_strategy=FIX_DURATION_STRATEGY_HOURLY,
            smart_events_conf=smart_record_conf_name,
            dag=smart_dag,
        )
        sensor_service.add_task_sequential_sensor(smart_task)
        sensor_service.add_task_short_circuit(smart_task, smart_gate)

        root_dag_gap_sensor_operator >> smart_gate

        model_dag_id = SmartModelDagFactory.get_dag_id(smart_record_conf_name)

        def _select_dag_run(context, dag_run_obj):
            # Hand the dag run object back only when the execution date passes
            # the daily check; None otherwise.
            if is_execution_date_valid(context['execution_date'],
                                       FIX_DURATION_STRATEGY_DAILY,
                                       smart_dag.schedule_interval):
                return dag_run_obj
            return None

        model_trigger = self._create_expanded_trigger_dag_run_operator(
            "smart_model_trigger", model_dag_id, smart_dag, _select_dag_run)

        set_schedule_interval(model_dag_id, FIX_DURATION_STRATEGY_DAILY)
        smart_task >> model_trigger
        return smart_task
示例#12
0
    def _build_alert_retention_operator(self, dag, entity_score_operator,
                                        entity_type):
        """
        Add the alert retention task, behind its own short circuit, downstream
        of the entity score operator.

        The retention interval and the minimal number of days before retention
        starts are read from the configuration when the short circuit callable
        runs.

        :param dag: the DAG to populate
        :param entity_score_operator: upstream task of the alert retention short circuit
        :param entity_type: passed to the alert retention operator builder
        """
        def _retention_gate_callable(**kwargs):
            run_date = kwargs['execution_date']
            interval_check = is_execution_date_valid(
                run_date,
                AlertRetentionOperatorBuilder.get_alert_retention_interval_in_hours(
                    PresidioDagBuilder.conf_reader),
                dag.schedule_interval)
            min_gap = timedelta(
                days=AlertRetentionOperatorBuilder.get_alert_min_time_to_start_retention_in_days(
                    PresidioDagBuilder.conf_reader))
            gap_check = PresidioDagBuilder.validate_the_gap_between_dag_start_date_and_current_execution_date(
                dag, min_gap, run_date, dag.schedule_interval)
            # '&' (not 'and'): both checks are always evaluated before being combined.
            return interval_check & gap_check

        retention_gate = self._create_infinite_retry_short_circuit_operator(
            task_id='alert_retention_short_circuit',
            dag=dag,
            python_callable=_retention_gate_callable)

        retention_task = AlertRetentionOperatorBuilder().build(dag, entity_type)

        entity_score_operator >> retention_gate >> retention_task
    def build(self, dag):
        """
        Populate the given indicator DAG with the adapter, input and scoring
        operators and set the dependencies between them.

        The schema is taken from the DAG's default args. When the schema is
        registered for input pre processing, a gap sensor is placed before the
        input operator and a trigger of the pre processing DAG after it. A
        trigger of the model DAG is always placed after the input operator.

        :param dag: The indicator DAG to populate
        :type dag: airflow.models.DAG
        :return: The given indicator DAG, after it has been populated
        :rtype: airflow.models.DAG
        """
        self.log.debug("populating the %s dag with input tasks", dag.dag_id)
        schema = dag.default_args.get('schema')

        adapter_task = AdapterOperatorBuilder(schema).build(dag)

        input_sensor_service = TaskSensorService()
        input_task = InputOperator(
            fixed_duration_strategy=timedelta(hours=1),
            command=PresidioDagBuilder.presidio_command,
            schema=schema,
            dag=dag)
        input_sensor_service.add_task_sequential_sensor(input_task)

        self.log.debug("populating the %s dag with scoring tasks", dag.dag_id)
        scoring_sensor_service = TaskSensorService()

        feature_aggregations_task = FeatureAggregationsOperator(
            fixed_duration_strategy=FIX_DURATION_STRATEGY_HOURLY,
            command=PresidioDagBuilder.presidio_command,
            data_source=schema,
            dag=dag)
        scoring_sensor_service.add_task_sequential_sensor(feature_aggregations_task)

        score_aggregations_task = ScoreAggregationsOperator(
            fixed_duration_strategy=FIX_DURATION_STRATEGY_HOURLY,
            command=PresidioDagBuilder.presidio_command,
            data_source=schema,
            dag=dag)
        scoring_sensor_service.add_task_sequential_sensor(score_aggregations_task)

        def _scoring_gate_callable(**kwargs):
            run_date = kwargs['execution_date']
            interval_check = is_execution_date_valid(
                run_date, FIX_DURATION_STRATEGY_HOURLY,
                get_schedule_interval(dag))
            gap_check = PresidioDagBuilder.validate_the_gap_between_dag_start_date_and_current_execution_date(
                dag, self._min_gap_from_dag_start_date_to_start_scoring,
                run_date, get_schedule_interval(dag))
            # '&' (not 'and'): both checks are always evaluated before being combined.
            return interval_check & gap_check

        scoring_gate = self._create_infinite_retry_short_circuit_operator(
            task_id='ade_scoring_hourly_short_circuit',
            dag=dag,
            python_callable=_scoring_gate_callable)

        if schema in InputPreProcessingDagFactory.get_registered_schemas():
            pre_processing_trigger = self._build_input_pre_processing_trigger_operator(
                dag, schema)

            pre_processing_gap_sensor = DagIntervalGapSequentialSensorOperator(
                dag=dag,
                task_id='input_pre_processing_gap_sensor_{0}'.format(schema),
                dag_ids=[InputPreProcessingDagFactory.get_dag_id(schema)],
                interval=timedelta(hours=1),
                start_time=dag.start_date,
                fixed_duration_strategy=FIX_DURATION_STRATEGY_DAILY,
                poke_interval=5)

            pre_processing_gap_sensor >> input_task >> pre_processing_trigger

        adapter_task >> input_task >> scoring_gate
        # Both scoring tasks are registered against the same short circuit.
        scoring_sensor_service.add_task_short_circuit(
            feature_aggregations_task, scoring_gate)
        scoring_sensor_service.add_task_short_circuit(
            score_aggregations_task, scoring_gate)

        model_trigger = self._build_model_trigger_operator(dag, schema)
        input_task >> model_trigger

        return dag