def demo_pipeline():
    """Returns a demo pipeline"""
    from data_integration.commands import bash, python

    # Root pipeline that ties everything together.
    root = Pipeline(
        id='demo',
        description='A small pipeline that demonstrates the interplay between pipelines, tasks and commands')

    root.add(Task(id='ping_localhost', description='Pings localhost',
                  commands=[bash.RunBash('ping -c 3 localhost')]))

    # Child pipeline: one ping task per well-known host.
    ping_hosts = Pipeline(id='sub_pipeline', description='Pings a number of hosts')
    for hostname in ['google', 'amazon', 'facebook']:
        ping_hosts.add(Task(id=f'ping_{hostname}', description=f'Pings {hostname}',
                            commands=[bash.RunBash(f'ping -c 3 {hostname}.com'),
                                      python.RunFunction(lambda: 1)]))

    # Explicit ordering between two of the generated tasks.
    ping_hosts.add_dependency('ping_amazon', 'ping_facebook')

    # A task added with an upstream at add-time rather than via add_dependency.
    ping_hosts.add(Task(id='ping_foo', description='Pings foo',
                        commands=[bash.RunBash('ping foo')]),
                   ['ping_amazon'])

    # Nest the child pipeline, then run a final task after it completes.
    root.add(ping_hosts, ['ping_localhost'])
    root.add(Task(id='sleep', description='Sleeps for 2 seconds',
                  commands=[bash.RunBash('sleep 2')]),
             ['sub_pipeline'])

    return root
def add_parallel_tasks(self, sub_pipeline: Pipeline) -> None:
    """Fills `sub_pipeline` with tasks that copy a schema from the source db to the target db.

    Creates one DDL task (schema, table structure, function definitions), then a set of
    parallel table-copy tasks, then a set of parallel index-creation tasks, with
    dependencies DDL -> copies -> indexes.

    Args:
        sub_pipeline: the pipeline to which the generated tasks are added

    Both `self.source_db_alias` and `self.target_db_alias` must refer to PostgreSQL
    databases.
    """
    source_db = mara_db.dbs.db(self.source_db_alias)
    target_db = mara_db.dbs.db(self.target_db_alias)
    assert (isinstance(source_db, mara_db.dbs.PostgreSQLDB))
    assert (isinstance(target_db, mara_db.dbs.PostgreSQLDB))

    # The server version determines how to query function definitions below
    # (pg_proc.proisagg was replaced by pg_proc.prokind in PostgreSQL 11).
    with mara_db.postgresql.postgres_cursor_context(self.source_db_alias) as cursor:
        pg_version = cursor.connection.server_version

    ddl_task = Task(
        id='create_tables_and_functions',
        description='Re-creates the schema, tables structure and functions on the target db',
        commands=[
            # schema and table structure
            # (was `--username="******"`, a scrubbed placeholder that would be passed
            #  literally to pg_dump; use the configured db user instead)
            bash.RunBash(
                command="(echo 'DROP SCHEMA IF EXISTS " + self.schema_name + " CASCADE;';\\\n"
                        + "    pg_dump --username=" + source_db.user
                        + " --host=" + source_db.host
                        + " --schema=" + self.schema_name
                        + " --section=pre-data --no-owner --no-privileges "
                        + source_db.database + ") \\\n"
                        + "  | " + mara_db.shell.query_command(self.target_db_alias,
                                                               echo_queries=False)
                        + ' --quiet'),
            # function definitions
            bash.RunBash(command=f'''echo "
SELECT CONCAT(pg_get_functiondef(pg_proc.oid),';') AS def
FROM (SELECT oid, *
      FROM pg_proc p
      WHERE {"p.prokind in ('p','f')" if pg_version >= 110000 else "NOT p.proisagg"}) pg_proc,
     pg_namespace
WHERE pg_proc.pronamespace = pg_namespace.oid
      AND nspname = '{self.schema_name}'" \\\n'''
                         + "  | " + mara_db.shell.copy_to_stdout_command(self.source_db_alias)
                         + ' \\\n'
                         + "  | " + mara_db.shell.query_command(self.target_db_alias,
                                                                echo_queries=False))
        ])
    sub_pipeline.add(ddl_task)

    # copy content of tables: distribute tables into chunks of roughly equal total size
    # (greedy bin packing: each table goes into the currently smallest chunk)
    number_of_chunks = self.max_number_of_parallel_tasks * 3
    table_copy_chunks = {i: [] for i in range(0, number_of_chunks)}
    current_size_per_table_copy_chunk = [0] * number_of_chunks
    table_types = {}

    with mara_db.postgresql.postgres_cursor_context(
            self.source_db_alias) as cursor:  # type: psycopg2.extensions.cursor
        # NOTE(review): self.schema_name is interpolated into SQL unparameterized;
        # it is assumed to be trusted internal configuration, not user input.
        cursor.execute("""
SELECT pg_class.relname AS table,
       relkind,
       CASE WHEN relkind = 'f'
            THEN cstore_table_size(nspname || '.' || relname) * 10 -- cstore tables with similar size take longer to copy
            ELSE pg_total_relation_size(pg_class.oid)
       END / 1000000.0 AS size
FROM pg_class
JOIN pg_namespace ON pg_namespace.oid = pg_class.relnamespace
WHERE nspname = '""" + self.schema_name + """'
      AND relkind IN ('r', 'f')
      AND relhassubclass = 'f'
ORDER BY size DESC""")
        for table_name, relkind, size in cursor.fetchall():  # renamed `type` (shadowed builtin)
            smallest_chunk_index = min(range(len(current_size_per_table_copy_chunk)),
                                       key=current_size_per_table_copy_chunk.__getitem__)
            current_size_per_table_copy_chunk[smallest_chunk_index] += size
            table_copy_chunks[smallest_chunk_index].append(table_name)
            table_types[table_name] = relkind

    copy_tasks = []
    for i, tables in table_copy_chunks.items():
        if tables:
            task = Task(
                id=f'copy_tables_{i}',
                description='Copies table content to the frontend db',
                commands=[
                    # was bare `RunBash`; use `bash.RunBash` consistently with the tasks above
                    bash.RunBash(
                        command=f'echo {shlex.quote(f"COPY {self.schema_name}.{table_name} TO STDOUT")} \\\n'
                                + '  | ' + mara_db.shell.copy_to_stdout_command(
                                    self.source_db_alias) + ' \\\n'
                                + '  | ' + mara_db.shell.copy_from_stdin_command(
                                    self.target_db_alias,
                                    target_table=f'{self.schema_name}.{table_name}'))
                    for table_name in tables
                ])
            copy_tasks.append(task)
            sub_pipeline.add(task, upstreams=[ddl_task])

    # create indexes: same greedy size-balancing, keyed on index DDL statements
    index_chunks = {i: [] for i in range(0, number_of_chunks)}
    current_size_per_index_chunk = [0] * number_of_chunks

    with mara_db.postgresql.postgres_cursor_context(self.source_db_alias) as cursor:
        cursor.execute("""
SELECT indexdef AS ddl, pg_total_relation_size(pg_class.oid) AS size
FROM pg_class
JOIN pg_namespace ON pg_namespace.oid = pg_class.relnamespace
JOIN pg_indexes ON pg_indexes.indexname = pg_class.relname AND pg_indexes.schemaname = nspname
WHERE nspname = '""" + self.schema_name + """' AND relkind = 'i'
ORDER BY size DESC;""")
        for ddl, size in cursor.fetchall():
            smallest_chunk_index = min(range(len(current_size_per_index_chunk)),
                                       key=current_size_per_index_chunk.__getitem__)
            current_size_per_index_chunk[smallest_chunk_index] += size
            index_chunks[smallest_chunk_index].append(ddl)

    for i, index_statements in index_chunks.items():
        if index_statements:
            index_task = Task(id=f'add_indexes_{i}',
                              description='Re-creates indexes on frontend db',
                              commands=[ExecuteSQL(sql_statement=statement,
                                                   db_alias=self.target_db_alias)
                                        for statement in index_statements])
            # all copies must be finished before any index is built
            sub_pipeline.add(index_task, upstreams=copy_tasks)