def run(self):
    # redirect stdout and stderr to queue
    logger.redirect_output(self.event_queue, self.task.path())

    succeeded = True
    attempt = 0
    try:
        while True:
            if not self.task.run():
                if attempt < self.task.max_retries:
                    attempt += 1
                    delay = pow(2, attempt + 2)
                    logger.log(message=f'Retry {attempt}/{self.task.max_retries} in {delay} seconds',
                               is_error=True, format=logger.Format.ITALICS)
                    time.sleep(delay)
                else:
                    succeeded = False
                    break
            else:
                break
    except Exception as e:
        logger.log(message=traceback.format_exc(), format=logger.Format.VERBATIM, is_error=True)
        succeeded = False

    self.status_queue.put(succeeded)
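The retry delay grows exponentially with the attempt number: pow(2, attempt + 2) yields 8, 16, 32, ... seconds. The following is a minimal, self-contained sketch of that backoff loop; the task function and the retry limit are made up for illustration and are not part of the framework.

# Sketch of the retry/backoff schedule above; `flaky_task` and MAX_RETRIES are hypothetical.
import time

MAX_RETRIES = 3

def flaky_task() -> bool:
    """Stand-in for task.run(); always fails to force retries."""
    return False

attempt = 0
while True:
    if flaky_task():
        break
    if attempt >= MAX_RETRIES:
        print('giving up')
        break
    attempt += 1
    delay = pow(2, attempt + 2)  # 8, 16, 32, ... seconds
    print(f'Retry {attempt}/{MAX_RETRIES} in {delay} seconds')
    time.sleep(delay)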
def run_shell_command(command: str, log_command: bool = True):
    """
    Runs a command in a bash shell and logs the output of the command in (near) real-time.

    Args:
        command: The command to run
        log_command: When true, the command itself is logged before execution

    Returns:
        Either (in order)
        - False when the exit code of the command was not 0
        - True when there was no output to stdout
        - The output to stdout, as an array of lines
    """
    if log_command:
        logger.log(command, format=logger.Format.ITALICS)

    process = subprocess.Popen(shlex.split(config.bash_command_string()) + ['-c', command],
                               stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                               universal_newlines=True)

    # keep stdout output
    output_lines = []

    # Unfortunately, only file descriptors and the system streams can be passed to
    # subprocess.Popen(..) (and not custom streams without a file handle).
    # So in order to be able to log the output in real-time, we have to
    # read the output streams of the process in a separate thread.
    def read_process_output():
        for line in process.stdout:
            output_lines.append(line)
            logger.log(line, format=logger.Format.VERBATIM)
        for line in process.stderr:
            logger.log(line, format=logger.Format.VERBATIM, is_error=True)

    read_output_thread = threading.Thread(target=read_process_output)
    read_output_thread.start()

    # wait until the process finishes
    while process.poll() is None:
        time.sleep(0.005)

    read_output_thread.join()

    exitcode = process.returncode
    if exitcode != 0:
        logger.log(f'exit code {exitcode}', is_error=True, format=logger.Format.ITALICS)
        return False

    return output_lines or True
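Because run_shell_command returns False on a non-zero exit code, True when there was no stdout output, and a list of lines otherwise, callers must distinguish the three cases explicitly. A usage sketch (the echoed command is an arbitrary illustration):

# Usage sketch for the three possible return values of run_shell_command; the command is made up.
result = run_shell_command('echo hello && echo world')

if result is False:
    print('command failed (non-zero exit code)')
elif result is True:
    print('command succeeded but produced no stdout output')
else:
    # a list of output lines, newlines included
    print('command output:', ''.join(result))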
def run(self):
    # redirect stdout and stderr to queue
    logger.redirect_output(self.event_queue, self.task.path())

    succeeded = True
    try:
        if not self.task.run():
            succeeded = False
    except Exception as e:
        logger.log(message=traceback.format_exc(), format=logger.Format.VERBATIM, is_error=True)
        succeeded = False

    self.status_queue.put(succeeded)
def run(self):
    from data_integration import shell
    from data_integration.logging import logger

    pipeline_base_directory = self.parent.parent.base_path()

    excludes = ' --exclude=__init__.py --exclude=\*.md --exclude=\*.pyc'

    # cd'ing && grepping in . allows us to show short file names
    # The "(...) || true" ensures that we do not get any output when nothing is found
    shell_command = f'(cd "{pipeline_base_directory}" && egrep --recursive {excludes} "{self.pattern}" .) || true'

    lines_or_bool = shell.run_shell_command(shell_command)
    if lines_or_bool is True:  # the "|| true" makes sure we never get False here
        return True
    else:
        logger.log(f"Please don't use the pattern '{self.pattern}' in this pipeline. Matching lines:",
                   format=logger.Format.ITALICS)
        lines = '\n'.join(lines_or_bool)
        logger.log(f"{lines}", format=logger.Format.ITALICS)
        return False
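For illustration, with a hypothetical pipeline directory and forbidden pattern, the assembled command looks like this (the path and pattern are made up; only the string composition from above is shown):

# Rendered example of the shell command assembled above; path and pattern are hypothetical.
pipeline_base_directory = '/repos/etl/app/pipelines/marketing'
pattern = 'SELECT \*'
excludes = ' --exclude=__init__.py --exclude=\*.md --exclude=\*.pyc'

shell_command = (f'(cd "{pipeline_base_directory}" && '
                 f'egrep --recursive {excludes} "{pattern}" .) || true')
print(shell_command)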
def run(self) -> bool:
    if self.sql_file_name:
        logger.log(self.sql_file_name, logger.Format.ITALICS)

    dependency_type = 'ExecuteSQL ' + (self.sql_file_name or self.sql_statement)

    if self.file_dependencies:
        assert self.parent
        pipeline_base_path = self.parent.parent.base_path()
        if not file_dependencies.is_modified(self.node_path(), dependency_type,
                                             pipeline_base_path, self.file_dependencies):
            logger.log('no changes')
            return True

    if not super().run():
        return False

    if self.file_dependencies:
        file_dependencies.update(self.node_path(), dependency_type,
                                 pipeline_base_path, self.file_dependencies)

    return True
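The two file_dependencies calls above skip re-execution when none of the watched files changed and record the new state after a successful run. A self-contained sketch of that idea, based on a content hash; this is only an illustration of the mechanism, not the framework's file_dependencies API, and all helper names are hypothetical:

# Illustration of change detection via a content hash over dependency files.
# Not the framework's file_dependencies API; names and the in-memory store are hypothetical.
import hashlib
import pathlib

_known_hashes: dict = {}  # stand-in for a persistent store

def _hash_files(paths: list) -> str:
    digest = hashlib.sha256()
    for path in sorted(paths):
        digest.update(pathlib.Path(path).read_bytes())
    return digest.hexdigest()

def is_modified(key: str, paths: list) -> bool:
    """True when the combined hash differs from the last recorded one."""
    return _known_hashes.get(key) != _hash_files(paths)

def update(key: str, paths: list) -> None:
    """Record the current hash after a successful run."""
    _known_hashes[key] = _hash_files(paths)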
def run(self) -> bool:
    # Retrieve the highest current value for the modification comparison (e.g. the highest timestamp).
    # We intentionally use the command line here (rather than sqlalchemy) to avoid forcing people
    # to install Python drivers, which can be hard, for example in the case of SQL Server.
    logger.log('get highest modification comparison value', format=logger.Format.ITALICS)
    max_value_query = f'SELECT max({self.modification_comparison}) AS maxval FROM {self.source_table}'
    logger.log(max_value_query, format=logger.Format.VERBATIM)
    result = shell.run_shell_command(f'echo {shlex.quote(max_value_query)} \\\n | '
                                     + mara_db.shell.copy_to_stdout_command(self.source_db_alias))

    if not result:
        return False

    # be flexible with different output formats: remove the column header & remove whitespace & quotes
    max_modification_value = ''.join(result).replace('maxval', '').strip().strip('"')
    logger.log(repr(max_modification_value), format=logger.Format.VERBATIM)

    # check whether the target table is empty
    target_table_is_empty = True
    target_table_empty_query = f'SELECT TRUE FROM {self.target_table} LIMIT 1'
    logger.log('check if target table is empty', format=logger.Format.ITALICS)
    logger.log(target_table_empty_query, format=logger.Format.VERBATIM)
    with mara_db.postgresql.postgres_cursor_context(self.target_db_alias) as cursor:
        cursor.execute(target_table_empty_query)
        target_table_is_empty = not cursor.fetchone()
    logger.log(f"target table{'' if target_table_is_empty else ' not'} empty",
               format=logger.Format.ITALICS)

    # get the last comparison value
    logger.log('get last comparison value', format=logger.Format.ITALICS)
    last_comparison_value = incremental_copy_status.get_last_comparison_value(
        self.node_path(), self.source_db_alias, self.source_table)
    logger.log(repr(last_comparison_value), format=logger.Format.VERBATIM)

    if target_table_is_empty or not last_comparison_value:
        # full load
        logger.log('full (non incremental) copy', logger.Format.ITALICS)
        if not target_table_is_empty:
            truncate_query = f'TRUNCATE TABLE {self.target_table}'
            logger.log(truncate_query, format=logger.Format.VERBATIM)
            with mara_db.postgresql.postgres_cursor_context(self.target_db_alias) as cursor:
                cursor.execute(truncate_query)

        # overwrite the comparison criteria to get everything
        replace = {self.comparison_value_placeholder: '(1=1)'}
        complete_copy_command = self._copy_command(self.target_table, replace)
        if not shell.run_shell_command(complete_copy_command):
            return False

    else:
        # incremental load.
        # First create the table which will contain the delta
        logger.log('incremental copy, create upsert table', logger.Format.ITALICS)
        create_upsert_table_query = (f'DROP TABLE IF EXISTS {self.target_table}_upsert;\n'
                                     + f'CREATE TABLE {self.target_table}_upsert AS SELECT * FROM {self.target_table} WHERE FALSE')
        if not shell.run_shell_command(f'echo {shlex.quote(create_upsert_table_query)} \\\n | '
                                       + mara_db.shell.query_command(self.target_db_alias, echo_queries=True)):
            return False

        # perform the actual copy, replacing the placeholder
        # with the comparison value from the latest successful execution
        replace = {self.comparison_value_placeholder:
                       f"({self.modification_comparison} >= '{last_comparison_value}')"}
        if not shell.run_shell_command(self._copy_command(self.target_table + '_upsert', replace)):
            return False

        # now the upsert table has to be merged with the target one

        # retrieve the target table columns to build the SET clause of the upsert query
        with mara_db.postgresql.postgres_cursor_context(self.target_db_alias) as cursor:
            retrieve_column_query = f"SELECT attname FROM pg_attribute WHERE attrelid = '{self.target_table}'::REGCLASS AND attnum > 0;"
            logger.log(retrieve_column_query, format=logger.Format.VERBATIM)
            cursor.execute(retrieve_column_query)
            set_clause = ', '.join([f'"{col[0]}" = EXCLUDED."{col[0]}"' for col in cursor.fetchall()])

        upsert_query = f"""
INSERT INTO {self.target_table}
SELECT {self.target_table}_upsert.*
FROM {self.target_table}_upsert
ON CONFLICT ({', '.join(['"' + primary_key + '"' for primary_key in self.primary_keys])})
DO UPDATE SET {set_clause}"""
        if not shell.run_shell_command(f'echo {shlex.quote(upsert_query)} \\\n | '
                                       + mara_db.shell.query_command(self.target_db_alias, echo_queries=True)):
            return False

    # update data_integration_incremental_copy_status
    incremental_copy_status.update(self.node_path(), self.source_db_alias,
                                   self.source_table, max_modification_value)

    return True
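To make the assembled upsert statement concrete, here is a sketch of what the string-building above produces for a made-up target table and primary key (dim.customer / customer_id are hypothetical; in the real code the column list comes from pg_attribute):

# Illustration of the upsert statement assembled above, with made-up table/column names.
target_table = 'dim.customer'
primary_keys = ['customer_id']
columns = ['customer_id', 'name']  # hypothetical stand-in for the pg_attribute lookup

set_clause = ', '.join([f'"{col}" = EXCLUDED."{col}"' for col in columns])
upsert_query = f"""
INSERT INTO {target_table}
SELECT {target_table}_upsert.*
FROM {target_table}_upsert
ON CONFLICT ({', '.join(['"' + pk + '"' for pk in primary_keys])})
DO UPDATE SET {set_clause}"""

print(upsert_query)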
def run():
    # collect system stats in a separate process
    statistics_process = multiprocessing.Process(
        target=lambda: system_statistics.generate_system_statistics(event_queue),
        name='system_statistics')
    statistics_process.start()

    try:
        # capture output of print statements and other unplanned output
        logger.redirect_output(event_queue, pipeline.path())

        # all nodes that have not run yet, ordered by priority
        node_queue: [pipelines.Node] = []

        # data needed for computing cost
        node_durations_and_run_times = node_cost.node_durations_and_run_times(pipeline.path())

        # putting nodes into the node queue
        def queue(nodes: [pipelines.Node]):
            for node in nodes:
                node_cost.compute_cost(node, node_durations_and_run_times)
                node_queue.append(node)
            node_queue.sort(key=lambda node: node.cost, reverse=True)

        if nodes:  # only run a set of child nodes
            def with_all_upstreams(nodes: {pipelines.Node}):
                """recursively find all upstreams of a list of nodes"""
                return functools.reduce(set.union,
                                        [with_all_upstreams(node.upstreams) for node in nodes], nodes)

            # when requested, include all upstreams of nodes, otherwise just use the provided nodes
            nodes_to_run = with_all_upstreams(set(nodes)) if with_upstreams else set(nodes)

            # remove everything from the pipeline that should not be run
            # (that makes updating dependencies between nodes easier)
            for node in set(pipeline.nodes.values()) - nodes_to_run:
                pipeline.remove(node)

            # queue remaining nodes
            queue(list(pipeline.nodes.values()))
        else:
            # remove dependencies to siblings
            pipeline.upstreams = set()
            pipeline.downstreams = set()
            # queue whole pipeline
            queue([pipeline])

        # book keeping
        run_start_time = datetime.datetime.now()
        # all nodes that already ran or that won't be run anymore
        processed_nodes: {pipelines.Node} = set()
        # running pipelines with start times and number of running children
        running_pipelines: {pipelines.Pipeline: [datetime.datetime, int]} = {}
        failed_pipelines: {pipelines.Pipeline} = set()  # pipelines with failed tasks
        running_task_processes: {pipelines.Task: TaskProcess} = {}

        def dequeue() -> pipelines.Node:
            """
            Finds the next task in the queue
            - without upstreams or where all upstreams have been run already
            - where the pipeline specific maximum number of parallel tasks per pipeline is not reached
            """
            for node in node_queue:  # type: pipelines.Node
                if ((not node.upstreams
                     or len(node.upstreams & processed_nodes) == len(node.upstreams))
                        and (not isinstance(node.parent, pipelines.Pipeline)
                             or (not node.parent.max_number_of_parallel_tasks)
                             or (not node.parent in running_pipelines)
                             or (running_pipelines[node.parent][1]
                                 < node.parent.max_number_of_parallel_tasks))):
                    node_queue.remove(node)
                    if node.parent in failed_pipelines:
                        # if the parent pipeline failed, don't launch new nodes
                        processed_nodes.add(node)
                    else:
                        return node

        def track_finished_pipelines():
            """when all nodes of a pipeline have been processed, then emit events"""
            for running_pipeline, (start_time, running_children) \
                    in dict(running_pipelines).items():  # type: pipelines.Pipeline
                if len(set(running_pipeline.nodes.values()) & processed_nodes) == len(running_pipeline.nodes):
                    succeeded = running_pipeline not in failed_pipelines
                    event_queue.put(events.Output(
                        node_path=running_pipeline.path(), format=logger.Format.ITALICS,
                        is_error=not succeeded,
                        message=f'{"succeeded" if succeeded else "failed"}, '
                                f'{logger.format_time_difference(run_start_time, datetime.datetime.now())}'))
                    event_queue.put(events.NodeFinished(
                        node_path=running_pipeline.path(), start_time=start_time,
                        end_time=datetime.datetime.now(),
                        is_pipeline=True, succeeded=succeeded))
                    del running_pipelines[running_pipeline]
                    processed_nodes.add(running_pipeline)

        # announce run start
        event_queue.put(events.RunStarted(node_path=pipeline.path(),
                                          start_time=run_start_time, pid=os.getpid()))

        # run as long
        # - as task processes are still running
        # - as there is still stuff in the node queue
        while running_task_processes or node_queue:
            # don't start new tasks while the maximum number of parallel tasks is already running
            if len(running_task_processes) < config.max_number_of_parallel_tasks():
                next_node = dequeue()  # get the next runnable node from the queue

                if next_node:
                    if isinstance(next_node, pipelines.Pipeline):
                        # connect pipeline nodes without upstreams to upstreams of the pipeline
                        for upstream in next_node.upstreams:
                            for pipeline_node in next_node.nodes.values():
                                if not pipeline_node.upstreams:
                                    next_node.add_dependency(upstream, pipeline_node)

                        # connect pipeline nodes without downstreams to downstreams of the pipeline
                        for downstream in next_node.downstreams:
                            for pipeline_node in next_node.nodes.values():
                                if not pipeline_node.downstreams:
                                    next_node.add_dependency(pipeline_node, downstream)

                        # queue all child nodes
                        queue(list(next_node.nodes.values()))

                        # book keeping and event emission
                        pipeline_start_time = datetime.datetime.now()
                        running_pipelines[next_node] = [pipeline_start_time, 0]
                        event_queue.put(events.NodeStarted(next_node.path(), pipeline_start_time, True))
                        event_queue.put(events.Output(
                            node_path=next_node.path(), format=logger.Format.ITALICS,
                            message='★ ' + node_cost.format_duration(
                                node_durations_and_run_times.get(tuple(next_node.path()), [0, 0])[0])))

                    elif isinstance(next_node, pipelines.ParallelTask):
                        # create sub tasks and queue them
                        try:
                            logger.redirect_output(event_queue, next_node.path())
                            logger.log('☆ Launching tasks', format=logger.Format.ITALICS)
                            sub_pipeline = next_node.launch()
                            next_node.parent.replace(next_node, sub_pipeline)
                            queue([sub_pipeline])
                        except Exception as e:
                            logger.log(message='Could not launch parallel tasks',
                                       format=logger.Format.ITALICS, is_error=True)
                            logger.log(message=traceback.format_exc(),
                                       format=events.Output.Format.VERBATIM, is_error=True)
                            failed_pipelines.add(next_node.parent)
                        finally:
                            logger.redirect_output(event_queue, pipeline.path())

                    else:
                        # run a task in a subprocess
                        if next_node.parent in running_pipelines:
                            running_pipelines[next_node.parent][1] += 1
                        event_queue.put(events.NodeStarted(next_node.path(), datetime.datetime.now(), False))
                        event_queue.put(events.Output(
                            node_path=next_node.path(), format=logger.Format.ITALICS,
                            message='★ ' + node_cost.format_duration(
                                node_durations_and_run_times.get(tuple(next_node.path()), [0, 0])[0])))

                        status_queue = multiprocessing.Queue()
                        process = TaskProcess(next_node, event_queue, status_queue)
                        process.start()
                        running_task_processes[next_node] = process

            # check whether some of the running processes finished
            for task_process in list(running_task_processes.values()):  # type: TaskProcess
                if not task_process.is_alive():
                    del running_task_processes[task_process.task]
                    if task_process.task.parent in running_pipelines:
                        running_pipelines[task_process.task.parent][1] -= 1

                    processed_nodes.add(task_process.task)

                    succeeded = not (task_process.status_queue.get() == False
                                     or task_process.exitcode != 0)
                    if not succeeded:
                        for parent in task_process.task.parents()[:-1]:
                            failed_pipelines.add(parent)

                    end_time = datetime.datetime.now()
                    event_queue.put(events.Output(
                        task_process.task.path(),
                        ('succeeded' if succeeded else 'failed') + ', '
                        + logger.format_time_difference(task_process.start_time, end_time),
                        format=logger.Format.ITALICS, is_error=not succeeded))
                    event_queue.put(events.NodeFinished(task_process.task.path(), task_process.start_time,
                                                        end_time, False, succeeded))

            # check if some pipelines finished
            track_finished_pipelines()

            # don't busy-wait
            time.sleep(0.001)

    except:
        event_queue.put(events.Output(node_path=pipeline.path(), message=traceback.format_exc(),
                                      format=logger.Format.ITALICS, is_error=True))

    # run again because `dequeue` might have moved more nodes to `processed_nodes`
    track_finished_pipelines()

    # kill the stats process (joining or terminating does not work in gunicorn)
    os.kill(statistics_process.pid, signal.SIGKILL)
    statistics_process.join()

    # run finished
    event_queue.put(events.RunFinished(node_path=pipeline.path(), end_time=datetime.datetime.now(),
                                       succeeded=not failed_pipelines))
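The dequeue logic above picks the first queued node whose upstreams have all been processed and whose parent pipeline still has a free parallel-task slot. A stripped-down, self-contained sketch of that readiness check; the node names, the dict-based nodes, and the per-pipeline bookkeeping are made up to mirror the structure above, not the framework's data model:

# Stripped-down sketch of the readiness check in dequeue(); all names are illustrative.
processed_nodes = {'extract_a'}
running_children_per_pipeline = {'pipeline_x': 1}
max_parallel_tasks_per_pipeline = {'pipeline_x': 2}

node_queue = [
    {'id': 'transform_b', 'parent': 'pipeline_x', 'upstreams': {'extract_a'}},
    {'id': 'load_c', 'parent': 'pipeline_x', 'upstreams': {'transform_b'}},
]

def dequeue():
    for node in node_queue:
        upstreams_done = node['upstreams'] <= processed_nodes
        parent = node['parent']
        slots_free = (running_children_per_pipeline.get(parent, 0)
                      < max_parallel_tasks_per_pipeline.get(parent, 1))
        if upstreams_done and slots_free:
            node_queue.remove(node)
            return node

print(dequeue())  # -> transform_b (its upstream already ran and a slot is free)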
def add_parallel_tasks(self, sub_pipeline: 'pipelines.Pipeline') -> None:
    files = []  # a list of (file_name, date_or_file_name) tuples
    data_dir = config.data_dir()
    first_date = config.first_date()

    for file in glob.iglob(str(pathlib.Path(data_dir / self.file_pattern))):
        file = str(pathlib.Path(file).relative_to(pathlib.Path(data_dir)))
        if self.date_regex:
            match = re.match(self.date_regex, file)
            if not match:
                raise Exception(f'file name "{file}" \ndoes not match date regex "{self.date_regex}"')
            date = datetime.date(*[int(group) for group in match.groups()])
            if date >= first_date:
                files.append((file, date))
        else:
            files.append((file, file))

    # sort by date when a regex is provided, or by file name otherwise
    files.sort(key=lambda x: x[1], reverse=True)

    # remove the latest file when requested
    if self.read_mode == ReadMode.ONLY_NEW_EXCEPT_LATEST:
        files = files[1:]

    # take only the latest file when requested
    if files and self.read_mode == ReadMode.ONLY_LATEST:
        files = files[:1]

    # for incremental loading, determine which files have already been processed;
    # reprocess all files when the file dependencies changed
    if (self.read_mode not in (ReadMode.ALL, ReadMode.ONLY_LATEST)
            and (not self.file_dependencies
                 or not _file_dependencies.is_modified(self.path(), 'ParallelReadFile',
                                                       self.parent.base_path(),
                                                       self.file_dependencies))):
        processed_files = _processed_files.already_processed_files(self.path())
        files = [x for x in files
                 if x[0] not in processed_files  # everything not yet read
                 or (self.read_mode == ReadMode.ONLY_CHANGED  # everything modified since the last read
                     and self._last_modification_timestamp(x[0]) > processed_files[x[0]])]

    if not files:
        logger.log('No newer files', format=logger.Format.ITALICS)
        return

    if self.read_mode != ReadMode.ALL and self.file_dependencies:
        def update_file_dependencies():
            _file_dependencies.update(self.path(), 'ParallelReadFile',
                                      self.parent.base_path(), self.file_dependencies)
            return True

        sub_pipeline.final_node.commands.append(python.RunFunction(update_file_dependencies))

    chunk_size = math.ceil(len(files) / (2 * config.max_number_of_parallel_tasks()))

    if self.partition_target_table_by_day_id:
        if not isinstance(mara_db.dbs.db(self.db_alias), mara_db.dbs.PostgreSQLDB):
            raise NotImplementedError(
                f'Partitioning by day_id has only been implemented for postgresql so far, \n'
                f'not for {mara_db.postgresql.engine(self.db_alias).name}')

        files_per_day = {}
        for (file, date) in files:
            if date in files_per_day:
                files_per_day[date].append(file)
            else:
                files_per_day[date] = [file]

        sql_statement = ''
        for date in files_per_day.keys():
            sql_statement += f'CREATE TABLE IF NOT EXISTS {self.target_table}_{date.strftime("%Y%m%d")}'
            sql_statement += f' ( CHECK (day_id = {date.strftime("%Y%m%d")}) ) INHERITS ({self.target_table});\n'

        create_partitions_task = pipelines.Task(
            id='create_partitions',
            description='Creates required target table partitions',
            commands=[sql.ExecuteSQL(sql_statement=sql_statement, echo_queries=False,
                                     db_alias=self.db_alias)])
        sub_pipeline.add(create_partitions_task)

        for n, chunk in enumerate(more_itertools.chunked(files_per_day.items(), chunk_size)):
            task = pipelines.Task(id=str(n), description='Reads a portion of the files')
            for (day, files) in chunk:
                target_table = self.target_table + '_' + day.strftime("%Y%m%d")
                for file in files:
                    task.add_commands(self.parallel_commands(file))
                task.add_command(sql.ExecuteSQL(sql_statement=f'ANALYZE {target_table}'))
            sub_pipeline.add(task, ['create_partitions'])
    else:
        for n, chunk in enumerate(more_itertools.chunked(files, chunk_size)):
            sub_pipeline.add(
                pipelines.Task(id=str(n),
                               description=f'Reads {len(chunk)} files',
                               commands=sum([self.parallel_commands(x[0]) for x in chunk], [])))
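The chunk size computed above aims at roughly two chunks per available parallel-task slot. A minimal sketch of the resulting batching; the file names and the parallel-task limit are made up:

# Minimal sketch of the file chunking above; file names and limits are hypothetical.
import math
import more_itertools

files = [f'clicks/2023-01-{day:02d}.csv' for day in range(1, 11)]  # 10 made-up files
max_number_of_parallel_tasks = 2

# roughly two chunks per parallel task slot
chunk_size = math.ceil(len(files) / (2 * max_number_of_parallel_tasks))  # -> 3

for n, chunk in enumerate(more_itertools.chunked(files, chunk_size)):
    print(f'task {n}: reads {len(chunk)} files')  # tasks of 3, 3, 3 and 1 files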