Example #1
    def run(self):
        # redirect stdout and stderr to queue
        logger.redirect_output(self.event_queue, self.task.path())

        succeeded = True
        attempt = 0
        try:
            while True:
                if not self.task.run():
                    if attempt < self.task.max_retries:
                        attempt += 1
                        delay = pow(2, attempt + 2)
                        logger.log(
                            message=f'Retry {attempt}/{self.task.max_retries} in {delay} seconds',
                            is_error=True,
                            format=logger.Format.ITALICS)
                        time.sleep(delay)
                    else:
                        succeeded = False
                        break
                else:
                    break
        except Exception as e:
            logger.log(message=traceback.format_exc(),
                       format=logger.Format.VERBATIM,
                       is_error=True)
            succeeded = False

        self.status_queue.put(succeeded)
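
A note on the retry delay above: delay = pow(2, attempt + 2) is an exponential backoff that starts at 8 seconds and doubles per attempt. A minimal sketch of the resulting schedule (the max_retries value of 3 is a hypothetical example):

# exponential backoff schedule produced by pow(2, attempt + 2)
for attempt in range(1, 4):               # hypothetical max_retries = 3
    print(attempt, pow(2, attempt + 2))   # -> 1 8, 2 16, 3 32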
Example #2
    def read_process_output():
        for line in process.stdout:
            output_lines.append(line)
            logger.log(line, format=logger.Format.VERBATIM)

        for line in process.stderr:
            logger.log(line, format=logger.Format.VERBATIM, is_error=True)
Example #3
def run_shell_command(command: str, log_command: bool = True):
    """
    Runs a command in a bash shell and logs the output of the command in (near) real-time.

    Args:
        command: The command to run
        log_command: When true, the command itself is logged before execution

    Returns:
        Either (checked in this order)
        - False when the exit code of the command was not 0
        - True when the command succeeded but produced no output on stdout
        - Otherwise, the output to stdout as a list of lines
    """
    if log_command:
        logger.log(command, format=logger.Format.ITALICS)

    process = subprocess.Popen(shlex.split(config.bash_command_string()) +
                               ['-c', command],
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE,
                               universal_newlines=True)

    # keep stdout output
    output_lines = []

    # unfortunately, only file descriptors and the system streams can be passed to
    # subprocess.Popen(..) (and not custom streams without a file handle).
    # So in order to be able to log the output in real-time, we have to
    # read the output streams of the process from a separate thread
    def read_process_output():
        for line in process.stdout:
            output_lines.append(line)
            logger.log(line, format=logger.Format.VERBATIM)

        for line in process.stderr:
            logger.log(line, format=logger.Format.VERBATIM, is_error=True)

    read_output_thread = threading.Thread(target=read_process_output)
    read_output_thread.start()

    # wait until the process finishes
    while process.poll() is None:
        time.sleep(0.005)

    read_output_thread.join()

    exitcode = process.returncode
    if exitcode != 0:
        logger.log(f'exit code {exitcode}',
                   is_error=True,
                   format=logger.Format.ITALICS)
        return False

    return output_lines or True
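
Callers of run_shell_command need to handle the three possible return values described in the docstring above. A minimal usage sketch (the command string is a hypothetical example):

# hedged usage sketch, not part of the original module
result = run_shell_command('ls *.csv')                # hypothetical command
if result is False:
    print('command failed (non-zero exit code)')
elif result is True:
    print('command succeeded but wrote nothing to stdout')
else:
    print(f'{len(result)} lines of output')           # result is a list of stdout lines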
Example #4
    def run(self):
        # redirect stdout and stderr to queue
        logger.redirect_output(self.event_queue, self.task.path())

        succeeded = True
        try:

            if not self.task.run():
                succeeded = False
        except Exception as e:
            logger.log(message=traceback.format_exc(),
                       format=logger.Format.VERBATIM,
                       is_error=True)
            succeeded = False

        self.status_queue.put(succeeded)
Example #5
    def run(self):
        from data_integration import shell
        from data_integration.logging import logger
        pipeline_base_directory = self.parent.parent.base_path()
        excludes = ' --exclude=__init__.py --exclude=\*.md --exclude=\*.pyc'
        # cd'ing && grepping in . allows us to show short file names
        # The "(...) || true" ensures a zero exit code even when nothing is found,
        # so that run_shell_command does not report a failure
        shell_command = f'(cd "{pipeline_base_directory}" && egrep --recursive {excludes} "{self.pattern}" .) || true'
        lines_or_bool = shell.run_shell_command(shell_command)
        if lines_or_bool is True:
            return True
        else:
            # because of the "|| true", run_shell_command never returns False here,
            # so lines_or_bool is a list of matching lines
            logger.log(
                f"Please don't use the pattern '{self.pattern}' in this pipeline. Matching lines:",
                format=logger.Format.ITALICS)
            lines = '\n'.join(lines_or_bool)
            logger.log(lines, format=logger.Format.ITALICS)
            return False
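
For context on the "|| true" guard above: egrep exits with a non-zero status when nothing matches, which run_shell_command would report as a failure and turn into False. Forcing the exit code to 0 makes the "no matches" case come back as True instead. A minimal sketch of that exit-code behavior (hypothetical pattern; requires bash and egrep on the PATH):

import subprocess

# grep family commands exit with 1 when no line matches
print(subprocess.run(['bash', '-c', 'echo x | egrep "no-such-pattern"']).returncode)             # 1
# "|| true" forces a zero exit code, so run_shell_command does not treat it as an error
print(subprocess.run(['bash', '-c', '(echo x | egrep "no-such-pattern") || true']).returncode)   # 0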
Example #6
    def run(self) -> bool:
        if self.sql_file_name:
            logger.log(self.sql_file_name, logger.Format.ITALICS)

        dependency_type = 'ExecuteSQL ' + (self.sql_file_name or self.sql_statement)

        if self.file_dependencies:
            assert self.parent
            pipeline_base_path = self.parent.parent.base_path()
            if not file_dependencies.is_modified(self.node_path(), dependency_type,
                                                 pipeline_base_path,
                                                 self.file_dependencies):
                logger.log('no changes')
                return True

        if not super().run():
            return False

        if self.file_dependencies:
            file_dependencies.update(self.node_path(), dependency_type, pipeline_base_path, self.file_dependencies)
        return True
Example #7
    def run(self) -> bool:
        # retrieve the highest current value for the modification comparison (e.g. the highest timestamp).
        # We intentionally use the command line here (rather than sqlalchemy) to avoid forcing people
        # to install Python drivers, which can be hard, for example in the case of SQL Server
        logger.log('get highest modification comparison value',
                   format=logger.Format.ITALICS)
        max_value_query = f'SELECT max({self.modification_comparison}) AS maxval FROM {self.source_table}'
        logger.log(max_value_query, format=logger.Format.VERBATIM)
        result = shell.run_shell_command(
            f'echo {shlex.quote(max_value_query)} \\\n  | ' +
            mara_db.shell.copy_to_stdout_command(self.source_db_alias))

        if not result:
            return False

        # be flexible with different output formats: remove the column header & remove whitespace & quotes
        max_modification_value = ''.join(result).replace('maxval',
                                                         '').strip().strip('"')
        logger.log(repr(max_modification_value), format=logger.Format.VERBATIM)

        # check whether target table is empty
        target_table_is_empty = True

        target_table_empty_query = f'SELECT TRUE FROM {self.target_table} LIMIT 1'
        logger.log('check if target table is empty',
                   format=logger.Format.ITALICS)
        logger.log(target_table_empty_query, format=logger.Format.VERBATIM)
        with mara_db.postgresql.postgres_cursor_context(
                self.target_db_alias) as cursor:
            cursor.execute(target_table_empty_query)
            target_table_is_empty = not cursor.fetchone()
            logger.log(
                f"target table{'' if target_table_is_empty else ' not'} empty",
                format=logger.Format.ITALICS)

        # get last comparison value
        logger.log('get last comparison value', format=logger.Format.ITALICS)
        last_comparison_value = incremental_copy_status.get_last_comparison_value(
            self.node_path(), self.source_db_alias, self.source_table)
        logger.log(repr(last_comparison_value), format=logger.Format.VERBATIM)

        if target_table_is_empty or not last_comparison_value:
            # full load
            logger.log('full (non incremental) copy', logger.Format.ITALICS)
            if not target_table_is_empty:
                truncate_query = f'TRUNCATE TABLE {self.target_table}'
                logger.log(truncate_query, format=logger.Format.VERBATIM)
                with mara_db.postgresql.postgres_cursor_context(
                        self.target_db_alias) as cursor:
                    cursor.execute(truncate_query)

            # overwrite the comparison criteria to get everything
            replace = {self.comparison_value_placeholder: '(1=1)'}
            complete_copy_command = self._copy_command(self.target_table,
                                                       replace)
            if not shell.run_shell_command(complete_copy_command):
                return False

        else:
            # incremental load. First create the table which will contain the delta
            logger.log('incremental copy, create upsert table',
                       logger.Format.ITALICS)
            create_upsert_table_query = (
                f'DROP TABLE IF EXISTS {self.target_table}_upsert;\n' +
                f'CREATE TABLE {self.target_table}_upsert AS SELECT * from {self.target_table} WHERE FALSE'
            )

            if not shell.run_shell_command(
                    f'echo {shlex.quote(create_upsert_table_query)} \\\n  | ' +
                    mara_db.shell.query_command(self.target_db_alias,
                                                echo_queries=True)):
                return False

            # perform the actual copy replacing the placeholder
            # with the comparison value from the latest successful execution
            replace = {
                self.comparison_value_placeholder:
                f'({self.modification_comparison} >= \'{last_comparison_value}\')'
            }
            if not shell.run_shell_command(
                    self._copy_command(self.target_table + '_upsert',
                                       replace)):
                return False

            # now the upsert table has to be merged with the target one

            # retrieve the target table columns to build the SET clause of the upsert query
            with mara_db.postgresql.postgres_cursor_context(
                    self.target_db_alias) as cursor:
                retrieve_column_query = f"SELECT attname FROM pg_attribute WHERE attrelid = '{self.target_table}'::REGCLASS AND attnum > 0;"
                logger.log(retrieve_column_query,
                           format=logger.Format.VERBATIM)
                cursor.execute(retrieve_column_query)
                set_clause = ', '.join([
                    f'"{col[0]}" = EXCLUDED."{col[0]}"'
                    for col in cursor.fetchall()
                ])

            upsert_query = f"""
INSERT INTO {self.target_table}
SELECT {self.target_table}_upsert.*
FROM {self.target_table}_upsert
ON CONFLICT ({', '.join(['"' + primary_key +'"' for primary_key in self.primary_keys])})
DO UPDATE SET {set_clause}"""
            if not shell.run_shell_command(
                    f'echo {shlex.quote(upsert_query)} \\\n  | ' +
                    mara_db.shell.query_command(self.target_db_alias,
                                                echo_queries=True)):
                return False

        # update data_integration_incremental_copy_status
        incremental_copy_status.update(self.node_path(), self.source_db_alias,
                                       self.source_table,
                                       max_modification_value)
        return True
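
To make the upsert step above more concrete, here is a minimal sketch of the statement that the set_clause / upsert_query construction would produce for a hypothetical table dim.customer with columns customer_id and name and primary key customer_id:

# hypothetical inputs, standing in for the pg_attribute query result and self.primary_keys
target_table = 'dim.customer'
columns = ['customer_id', 'name']
primary_keys = ['customer_id']

set_clause = ', '.join([f'"{col}" = EXCLUDED."{col}"' for col in columns])
upsert_query = f"""
INSERT INTO {target_table}
SELECT {target_table}_upsert.*
FROM {target_table}_upsert
ON CONFLICT ({', '.join(['"' + pk + '"' for pk in primary_keys])})
DO UPDATE SET {set_clause}"""
print(upsert_query)
# prints (leading blank line omitted):
# INSERT INTO dim.customer
# SELECT dim.customer_upsert.*
# FROM dim.customer_upsert
# ON CONFLICT ("customer_id")
# DO UPDATE SET "customer_id" = EXCLUDED."customer_id", "name" = EXCLUDED."name"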
Example #8
    def run():

        # collect system stats in a separate Process
        statistics_process = multiprocessing.Process(
            target=lambda: system_statistics.generate_system_statistics(
                event_queue),
            name='system_statistics')
        statistics_process.start()

        try:
            # capture output of print statements and other unplanned output
            logger.redirect_output(event_queue, pipeline.path())

            # all nodes that have not run yet, ordered by priority
            node_queue: [pipelines.Node] = []

            # data needed for computing cost
            node_durations_and_run_times = node_cost.node_durations_and_run_times(
                pipeline.path())

            # Putting nodes into the node queue
            def queue(nodes: [pipelines.Node]):
                for node in nodes:
                    node_cost.compute_cost(node, node_durations_and_run_times)
                    node_queue.append(node)
                node_queue.sort(key=lambda node: node.cost, reverse=True)

            if nodes:  # only run a set of child nodes

                def with_all_upstreams(nodes: {pipelines.Node}):
                    """recursively find all upstreams of a list of nodes"""
                    return functools.reduce(
                        set.union,
                        [with_all_upstreams(node.upstreams)
                         for node in nodes], nodes)

                # when requested, include all upstreams of nodes, otherwise just use provided nodes
                nodes_to_run = with_all_upstreams(
                    set(nodes)) if with_upstreams else set(nodes)

                # remove everything from pipeline that should not be run
                # (that makes updating dependencies between nodes easier)
                for node in set(pipeline.nodes.values()) - nodes_to_run:
                    pipeline.remove(node)

                # queue remaining nodes
                queue(list(pipeline.nodes.values()))

            else:
                # remove dependencies to siblings
                pipeline.upstreams = set()
                pipeline.downstreams = set()
                # queue whole pipeline
                queue([pipeline])

            # book keeping
            run_start_time = datetime.datetime.now()
            # all nodes that already ran or that won't be run anymore
            processed_nodes: {pipelines.Node} = set()
            # running pipelines with start times and number of running children
            running_pipelines: {
                pipelines.Pipeline: [datetime.datetime, int]
            } = {}
            failed_pipelines: {pipelines.Pipeline
                               } = set()  # pipelines with failed tasks
            running_task_processes: {pipelines.Task: TaskProcess} = {}

            def dequeue() -> pipelines.Node:
                """
                Finds the next task in the queue
                - without upstreams or where all upstreams have been run already
                - where the pipeline specific maximum number of parallel tasks per pipeline is not reached
                """
                for node in node_queue:  # type: pipelines.Node
                    if ((not node.upstreams
                         or len(node.upstreams & processed_nodes) == len(
                             node.upstreams)) and
                        (not isinstance(node.parent, pipelines.Pipeline) or
                         (not node.parent.max_number_of_parallel_tasks) or
                         (not node.parent in running_pipelines) or
                         (running_pipelines[node.parent][1] <
                          node.parent.max_number_of_parallel_tasks))):
                        node_queue.remove(node)
                        if node.parent in failed_pipelines:
                            # if the parent pipeline failed, don't launch new nodes
                            processed_nodes.add(node)
                        else:
                            return node

            def track_finished_pipelines():
                """when all nodes of a pipeline have been processed, then emit events"""
                for running_pipeline, (start_time, running_children) \
                        in dict(running_pipelines).items():  # type: pipelines.Pipeline
                    if len(
                            set(running_pipeline.nodes.values())
                            & processed_nodes) == len(running_pipeline.nodes):
                        succeeded = running_pipeline not in failed_pipelines
                        event_queue.put(
                            events.Output(
                                node_path=running_pipeline.path(),
                                format=logger.Format.ITALICS,
                                is_error=not succeeded,
                                message=
                                f'{"succeeded" if succeeded else "failed"}, {logger.format_time_difference(run_start_time, datetime.datetime.now())}'
                            ))
                        event_queue.put(
                            events.NodeFinished(
                                node_path=running_pipeline.path(),
                                start_time=start_time,
                                end_time=datetime.datetime.now(),
                                is_pipeline=True,
                                succeeded=succeeded))
                        del running_pipelines[running_pipeline]
                        processed_nodes.add(running_pipeline)

            # announce run start
            event_queue.put(
                events.RunStarted(node_path=pipeline.path(),
                                  start_time=run_start_time,
                                  pid=os.getpid()))

            # run as long
            # - as task processes are still running
            # - as there is still stuff in the node queue
            while running_task_processes or node_queue:
                # don't do anything if the maximum number of parallel tasks is currently running
                if len(running_task_processes) < config.max_number_of_parallel_tasks():

                    # get the next runnable node from the queue
                    next_node = dequeue()

                    if next_node:
                        if isinstance(next_node, pipelines.Pipeline):
                            # connect pipeline nodes without upstreams to upstreams of pipeline
                            for upstream in next_node.upstreams:
                                for pipeline_node in next_node.nodes.values():
                                    if not pipeline_node.upstreams:
                                        next_node.add_dependency(
                                            upstream, pipeline_node)

                            # connect pipeline nodes without downstreams to downstream of pipeline
                            for downstream in next_node.downstreams:
                                for pipeline_node in next_node.nodes.values():
                                    if not pipeline_node.downstreams:
                                        next_node.add_dependency(
                                            pipeline_node, downstream)

                            # queue all child nodes
                            queue(list(next_node.nodes.values()))

                            # book keeping and event emission
                            pipeline_start_time = datetime.datetime.now()
                            running_pipelines[next_node] = [
                                pipeline_start_time, 0
                            ]
                            event_queue.put(
                                events.NodeStarted(next_node.path(),
                                                   pipeline_start_time, True))
                            event_queue.put(
                                events.Output(
                                    node_path=next_node.path(),
                                    format=logger.Format.ITALICS,
                                    message='★ ' + node_cost.format_duration(
                                        node_durations_and_run_times.get(
                                            tuple(next_node.path()),
                                            [0, 0])[0])))

                        elif isinstance(next_node, pipelines.ParallelTask):
                            # create sub tasks and queue them
                            try:
                                logger.redirect_output(event_queue,
                                                       next_node.path())
                                logger.log('☆ Launching tasks',
                                           format=logger.Format.ITALICS)
                                sub_pipeline = next_node.launch()
                                next_node.parent.replace(
                                    next_node, sub_pipeline)
                                queue([sub_pipeline])

                            except Exception as e:
                                logger.log(
                                    message=f'Could not launch parallel tasks',
                                    format=logger.Format.ITALICS,
                                    is_error=True)
                                logger.log(
                                    message=traceback.format_exc(),
                                    format=events.Output.Format.VERBATIM,
                                    is_error=True)
                                failed_pipelines.add(next_node.parent)
                            finally:
                                logger.redirect_output(event_queue,
                                                       pipeline.path())

                        else:
                            # run a task in a subprocess
                            if next_node.parent in running_pipelines:
                                running_pipelines[next_node.parent][1] += 1
                            event_queue.put(
                                events.NodeStarted(next_node.path(),
                                                   datetime.datetime.now(),
                                                   False))
                            event_queue.put(
                                events.Output(
                                    node_path=next_node.path(),
                                    format=logger.Format.ITALICS,
                                    message='★ ' + node_cost.format_duration(
                                        node_durations_and_run_times.get(
                                            tuple(next_node.path()),
                                            [0, 0])[0])))

                            status_queue = multiprocessing.Queue()
                            process = TaskProcess(next_node, event_queue,
                                                  status_queue)
                            process.start()
                            running_task_processes[next_node] = process

                # check whether some of the running processes finished
                for task_process in list(
                        running_task_processes.values()):  # type: TaskProcess
                    if not task_process.is_alive():
                        del running_task_processes[task_process.task]
                        if task_process.task.parent in running_pipelines:
                            running_pipelines[task_process.task.parent][1] -= 1

                        processed_nodes.add(task_process.task)

                        succeeded = not (task_process.status_queue.get()
                                         == False
                                         or task_process.exitcode != 0)
                        if not succeeded:
                            for parent in task_process.task.parents()[:-1]:
                                failed_pipelines.add(parent)

                        end_time = datetime.datetime.now()
                        event_queue.put(
                            events.Output(
                                task_process.task.path(),
                                ('succeeded' if succeeded else 'failed') +
                                ',  ' + logger.format_time_difference(
                                    task_process.start_time, end_time),
                                format=logger.Format.ITALICS,
                                is_error=not succeeded))
                        event_queue.put(
                            events.NodeFinished(task_process.task.path(),
                                                task_process.start_time,
                                                end_time, False, succeeded))

                # check if some pipelines finished
                track_finished_pipelines()

                # don't busy-wait
                time.sleep(0.001)

        except:
            event_queue.put(
                events.Output(node_path=pipeline.path(),
                              message=traceback.format_exc(),
                              format=logger.Format.ITALICS,
                              is_error=True))

        # run again because `dequeue` might have moved more nodes to `processed_nodes`
        track_finished_pipelines()

        # kill the stats process (joining or terminating does not work in gunicorn)
        os.kill(statistics_process.pid, signal.SIGKILL)
        statistics_process.join()

        # run finished
        event_queue.put(
            events.RunFinished(node_path=pipeline.path(),
                               end_time=datetime.datetime.now(),
                               succeeded=not failed_pipelines))
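
The dequeue function above combines two criteria: all upstreams of a node must already be processed, and the parent pipeline's max_number_of_parallel_tasks limit must not be exhausted. A minimal standalone reformulation of those two checks, using hypothetical toy objects rather than real pipelines.Node instances (the isinstance check on the parent is left out for brevity):

def can_run(node, processed_nodes: set, running_pipelines: dict) -> bool:
    """A sketch of the two dequeue conditions above; not part of the original module."""
    # original: len(node.upstreams & processed_nodes) == len(node.upstreams), i.e. a subset test
    upstreams_done = not node.upstreams or node.upstreams <= processed_nodes
    parent = node.parent
    below_parallelism_limit = (
        not getattr(parent, 'max_number_of_parallel_tasks', None)
        or parent not in running_pipelines
        # running_pipelines maps a pipeline to [start_time, number_of_running_children]
        or running_pipelines[parent][1] < parent.max_number_of_parallel_tasks)
    return upstreams_done and below_parallelism_limit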
Example #9
    def add_parallel_tasks(self, sub_pipeline: 'pipelines.Pipeline') -> None:
        files = []  # A list of (file_name, date_or_file_name) tuples
        data_dir = config.data_dir()
        first_date = config.first_date()

        for file in glob.iglob(str(pathlib.Path(data_dir /
                                                self.file_pattern))):
            file = str(pathlib.Path(file).relative_to(pathlib.Path(data_dir)))
            if self.date_regex:
                match = re.match(self.date_regex, file)
                if not match:
                    raise Exception(
                        f'file name "{file}" \ndoes not match date regex "{self.date_regex}"'
                    )
                date = datetime.date(*[int(group) for group in match.groups()])
                if date >= first_date:
                    files.append((file, date))
            else:
                files.append((file, file))

        # sort by date when regex provided or by filename otherwise
        files.sort(key=lambda x: x[1], reverse=True)

        # remove latest file when requested
        if self.read_mode == ReadMode.ONLY_NEW_EXCEPT_LATEST:
            files = files[1:]

        # take only latest file when requested
        if files and self.read_mode == ReadMode.ONLY_LATEST:
            files = files[:1]

        # for incremental loading, determine which files already have been processed
        # reprocess all when file dependencies changed
        if (self.read_mode not in (ReadMode.ALL, ReadMode.ONLY_LATEST) and
            (not self.file_dependencies or not _file_dependencies.is_modified(
                self.path(), 'ParallelReadFile', self.parent.base_path(),
                self.file_dependencies))):
            processed_files = _processed_files.already_processed_files(
                self.path())

            files = [
                x for x in files
                if x[0] not in processed_files  # everything not yet read
                or
                (self.read_mode == ReadMode.ONLY_CHANGED  # everything modified
                 and self._last_modification_timestamp(x[0]) > processed_files[
                     x[0]])
            ]

        if not files:
            logger.log('No newer files', format=logger.Format.ITALICS)
            return

        if self.read_mode != ReadMode.ALL and self.file_dependencies:

            def update_file_dependencies():
                _file_dependencies.update(self.path(), 'ParallelReadFile',
                                          self.parent.base_path(),
                                          self.file_dependencies)
                return True

            sub_pipeline.final_node.commands.append(
                python.RunFunction(update_file_dependencies))

        chunk_size = math.ceil(
            len(files) / (2 * config.max_number_of_parallel_tasks()))

        if self.partition_target_table_by_day_id:
            if not isinstance(mara_db.dbs.db(self.db_alias),
                              mara_db.dbs.PostgreSQLDB):
                raise NotImplementedError(
                    f'Partitioning by day_id has only been implemented for postgresql so far, \n'
                    f'not for {mara_db.postgresql.engine(self.db_alias).name}')
            files_per_day = {}
            for (file, date) in files:
                if date in files_per_day:
                    files_per_day[date].append(file)
                else:
                    files_per_day[date] = [file]

            sql_statement = ''
            for date in files_per_day.keys():
                sql_statement += f'CREATE TABLE IF NOT EXISTS {self.target_table}_{date.strftime("%Y%m%d")}'
                sql_statement += f' ( CHECK (day_id = {date.strftime("%Y%m%d")}) ) INHERITS ({self.target_table});\n'

            create_partitions_task = pipelines.Task(
                id='create_partitions',
                description='Creates required target table partitions',
                commands=[
                    sql.ExecuteSQL(sql_statement=sql_statement,
                                   echo_queries=False,
                                   db_alias=self.db_alias)
                ])

            sub_pipeline.add(create_partitions_task)

            for n, chunk in enumerate(
                    more_itertools.chunked(files_per_day.items(), chunk_size)):
                task = pipelines.Task(
                    id=str(n), description='Reads a portion of the files')
                for (day, files) in chunk:
                    target_table = self.target_table + '_' + day.strftime(
                        "%Y%m%d")
                    for file in files:
                        task.add_commands(self.parallel_commands(file))
                    task.add_command(
                        sql.ExecuteSQL(
                            sql_statement=f'ANALYZE {target_table}'))
                sub_pipeline.add(task, ['create_partitions'])
        else:
            for n, chunk in enumerate(more_itertools.chunked(
                    files, chunk_size)):
                sub_pipeline.add(
                    pipelines.Task(
                        id=str(n),
                        description=f'Reads {len(chunk)} files',
                        commands=sum(
                            [self.parallel_commands(x[0]) for x in chunk],
                            [])))
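
The chunk_size computation above, math.ceil(len(files) / (2 * config.max_number_of_parallel_tasks())), creates roughly twice as many chunks as there are parallel task slots, presumably so that faster tasks can pick up additional chunks. A quick worked example with hypothetical numbers:

import math

number_of_files = 100        # hypothetical
max_parallel_tasks = 8       # hypothetical config.max_number_of_parallel_tasks()
chunk_size = math.ceil(number_of_files / (2 * max_parallel_tasks))
print(chunk_size)                                # 7  (= ceil(100 / 16))
print(math.ceil(number_of_files / chunk_size))   # 15 read tasks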