def pipeline_children_table(path: str):
    """
    Renders an html table with one row per entry in `pipeline.nodes`

    Args:
        path: The slash-separated path of the pipeline

    Returns:
        The concatenated html of a script that creates `runButtons`, the table itself
        and a script that calls `floatMaraTableHeaders()`
    """
    pipeline, __ = pipelines.find_node(path.split('/'))
    assert (isinstance(pipeline, pipelines.Pipeline))

    timings = node_cost.node_durations_and_run_times(pipeline.path())

    def table_row(child):
        """Builds the table row for a single child node"""
        # children without an entry in `timings` get empty strings for both values
        avg_duration, avg_run_time = timings.get(tuple(child.path()), ['', ''])

        # `<wbr>` after each underscore allows the browser to break long ids
        id_cell = _.td[_.a(href=views.node_url(child))[child.id.replace('_', '_<wbr>')]]
        description_cell = _.td[child.description]
        labels_cell = _.td[views.format_labels(child)]
        duration_cell = _.td[node_cost.format_duration(avg_duration)]

        # the run time is greyed out when it equals the duration
        if avg_duration == avg_run_time:
            run_time_style = 'color:#bbb'
        else:
            run_time_style = ''
        run_time_cell = _.td(style=run_time_style)[node_cost.format_duration(avg_run_time)]

        cost_cell = _.td[node_cost.format_duration(node_cost.compute_cost(child, timings))]

        # the selection checkbox is only rendered when running from the web ui is allowed
        if config.allow_run_from_web_ui():
            checkbox = _.input(class_='pipeline-node-checkbox', type='checkbox',
                               value=child.id, name='ids[]',
                               onchange='runButtons.update()')
        else:
            checkbox = ''

        return _.tr[id_cell, description_cell, labels_cell, duration_cell,
                    run_time_cell, cost_cell, _.td[checkbox]]

    rows = [table_row(child) for child in pipeline.nodes.values()]

    run_buttons_script = str(_.script['var runButtons = new PipelineRunButtons();'])
    table_html = str(bootstrap.table(
        ['ID', 'Description', '', 'Avg duration', 'Avg run time', 'Cost', ''], rows))
    float_headers_script = str(_.script['floatMaraTableHeaders();'])

    return run_buttons_script + table_html + float_headers_script
def run(path, nodes, with_upstreams):
    """
    Runs a pipeline or a sub-set of its nodes

    Args:
        path: Comma-separated ids that lead to the pipeline. An empty or missing value
            becomes an empty id list (presumably addressing the root pipeline;
            confirm against `pipelines.find_node`)
        nodes: Comma-separated ids of the nodes of the pipeline to run selectively.
            An empty or missing value selects no specific nodes
        with_upstreams: Passed through unchanged to `run_pipeline`

    Exits the process with status -1 when the pipeline or one of the nodes can not be
    found, when the path does not point to a pipeline, or when `run_pipeline` returns
    a falsy value.
    """
    # the pipeline to run; `''.split(',')` would yield `['']` rather than an empty path
    path = path.split(',') if path else []
    pipeline, found = pipelines.find_node(path)

    if not found:
        print(f'Pipeline {path} not found', file=sys.stderr)
        sys.exit(-1)

    if not isinstance(pipeline, pipelines.Pipeline):
        print(f'Node {path} is not a pipeline, but a {pipeline.__class__.__name__}',
              file=sys.stderr)
        sys.exit(-1)

    # a list of nodes to run selectively in the pipeline
    _nodes = set()
    for node_id in (nodes.split(',') if nodes else []):
        node = pipeline.nodes.get(node_id)
        if not node:
            print(f'Node "{node_id}" not found in pipeline {path}', file=sys.stderr)
            sys.exit(-1)
        _nodes.add(node)

    if not run_pipeline(pipeline, _nodes, with_upstreams):
        sys.exit(-1)
def node_page(path: str):
    """
    Creates a node visualization page including title, action buttons, etc.

    Args:
        path: The slash-separated path of the node

    Returns:
        A redirect to the url of the node returned by `pipelines.find_node` when the
        lookup reports "not found" but still returns a node; otherwise a
        `response.Response` for the page. Aborts with 404 when there is no node.
    """
    node, found = pipelines.find_node(path.split('/'))

    if not node:
        flask.abort(404, f'Node "{path}" not found')
    if not found:
        return flask.redirect(views.node_url(node), 302)

    if node.parent:
        # linked ids of the parents (without the first and the last one), each followed by a separator
        parent_links = [[_.a(href=views.node_url(parent))[parent.id], ' / ']
                        for parent in node.parents()[1:-1]]
        title = [node.__class__.__name__, ' ', parent_links, node.id]
    else:
        title = 'Data Integration'

    if config.allow_run_from_web_ui():
        buttons = action_buttons(node)
    else:
        buttons = []

    # creates the global `nodePage` js object once the DOM is ready
    init_script = (
        ''' var nodePage = null; document.addEventListener('DOMContentLoaded', function() { nodePage = NodePage("'''
        + flask.url_for('data_integration.node_page', path='')
        + '''"); });''')

    html = [_.script[init_script],
            dependency_graph.card(node),
            run_time_chart.card(node),
            node_content(node),
            last_runs.card(node)]

    js_files = ['https://www.gstatic.com/charts/loader.js']
    for filename in ('node-page.js', 'utils.js', 'run-time-chart.js',
                     'system-stats-chart.js', 'timeline-chart.js', 'kolorwheel.js'):
        js_files.append(flask.url_for('data_integration.static', filename=filename))

    css_files = [flask.url_for('data_integration.static', filename=filename)
                 for filename in ('common.css', 'node-page.css', 'timeline-chart.css')]

    return response.Response(title=title,
                             action_buttons=buttons,
                             html=html,
                             js_files=js_files,
                             css_files=css_files)
def run_time_chart(path: str):
    """
    Returns the html for a chart of the run times of a node

    Args:
        path: The slash-separated path of the node

    Returns:
        A div with the chart container and a script that calls `drawRunTimeChart`,
        or a "Not enough data" note when fewer than two rows are available.
        Aborts with 404 when the node is not found.
    """
    node, found = pipelines.find_node(path.split('/'))
    if not found:
        flask.abort(404, f'Node "{path}" not found')

    # the sql file lives next to this module; it is executed before
    # `pg_temp.node_run_times` is queried
    sql_file = pathlib.Path(__file__).parent / 'run_time_chart.sql'
    query = sql_file.read_text()

    with mara_db.postgresql.postgres_cursor_context(
            'mara') as cursor:  # type: psycopg2.extensions.cursor
        cursor.execute(query)
        cursor.execute('SELECT row_to_json(t) FROM pg_temp.node_run_times(%s) t',
                       (node.path(),))
        rows = [record[0] for record in cursor.fetchall()]

        if len(rows) < 2:
            return str(_.i(style='color:#888')['Not enough data'])

        # the chart grows by 15px for each child run of the first row
        first_child_runs = rows[0]['child_runs']
        number_of_child_runs = len(first_child_runs) if first_child_runs else 0

        chart_container = _.div(id='run-time-chart', class_='google-chart',
                                style=f'height:{100+15*number_of_child_runs}px')[' ']
        draw_script = _.script[
            f" drawRunTimeChart('run-time-chart', '{path}', {json.dumps(rows)}); "]

        return str(_.div[chart_container, draw_script])
def reset_incremental_processing(path):
    """
    Reset status of incremental processing for a node

    Args:
        path: Comma-separated ids that lead to the node; an empty or missing value
            becomes an empty id list

    Exits the process with status -1 when the node can not be found.
    """
    node_path = path.split(',') if path else []

    # only the "found" flag of the lookup is needed, the reset works on the path
    __, found = pipelines.find_node(node_path)
    if found:
        reset.reset_incremental_processing(node_path)
        return

    print(f'Node {node_path} not found', file=sys.stderr)
    sys.exit(-1)
def do_run(path: str, with_upstreams: bool, ids: str):
    """
    Runs a pipeline (or some of its nodes) and streams the resulting events

    Args:
        path: The slash-separated path of the pipeline
        with_upstreams: Passed through unchanged to `execution.run_pipeline`
        ids: Slash-separated ids of the nodes of the pipeline to run selectively.
            An empty or missing value selects no specific nodes

    Returns:
        A `text/event-stream` response with one `event:` / `data:` block for each
        event yielded by `execution.run_pipeline`.

    Aborts with 403 when running from the web ui is disabled, with 404 when the
    pipeline or one of the requested nodes does not exist, and with 400 when the
    path does not point to a pipeline.
    """
    if not config.allow_run_from_web_ui():
        flask.abort(
            403, 'Running piplelines from web ui is disabled for this instance')

    pipeline, found = pipelines.find_node(path.split('/'))
    if not found:
        flask.abort(404, f'Pipeline "{path}" not found')

    # only pipelines have child nodes that can be looked up and run
    if not isinstance(pipeline, pipelines.Pipeline):
        flask.abort(
            400, f'Node "{path}" is not a pipeline, but a {pipeline.__class__.__name__}')

    # validate the requested ids up front so that an unknown id is reported as 404
    # rather than surfacing as an unhandled KeyError
    nodes = set()
    for node_id in (ids.split('/') if ids else []):
        if node_id not in pipeline.nodes:
            flask.abort(404, f'Node "{node_id}" not found in pipeline "{path}"')
        nodes.add(pipeline.nodes[node_id])

    def process_events():
        """Turns each run event into a server-sent-events message"""
        for event in execution.run_pipeline(pipeline, nodes, with_upstreams):
            yield f'event: {event.__class__.__name__}\ndata: ' + event.to_json() + '\n\n'

    return flask.Response(process_events(), mimetype="text/event-stream")
def timeline_chart(path: str, run_id: int):
    """
    Returns a script that draws the timeline of the node runs below a node

    Args:
        path: The slash-separated path of the node
        run_id: The id of the run to show. If falsy, the result of
            `_latest_run_id` for the node is used instead

    Returns:
        A script element that calls `drawTimelineChart`, or an empty string when
        there is no run id or no matching node runs
    """
    node, __ = pipelines.find_node(path.split('/'))

    if not run_id:
        run_id = _latest_run_id(node.path())
    if not run_id:
        return ''

    # selects all runs whose node path starts with the path of `node` and is longer than it
    query = (' SELECT node_path, start_time, end_time, succeeded, is_pipeline'
             ' FROM data_integration_node_run'
             ' WHERE node_path [1 :%(level)s] = %(node_path)s'
             ' AND array_length(node_path, 1) > %(level)s'
             ' AND run_id = %(run_id)s;')

    with mara_db.postgresql.postgres_cursor_context(
            'mara') as cursor:  # type: psycopg2.extensions.cursor
        cursor.execute(query, {'level': len(node.path()),
                               'node_path': node.path(),
                               'run_id': run_id})

        nodes = []
        for node_path, start_time, end_time, succeeded, is_pipeline in cursor.fetchall():
            nodes.append({
                # the label is the path relative to `node`
                'label': ' / '.join(node_path[len(node.path()):]),
                'status': 'succeeded' if succeeded else 'failed',
                'type': 'pipeline' if is_pipeline else 'task',
                'url': flask.url_for('data_integration.node_page',
                                     path='/'.join(node_path)),
                'start': start_time.isoformat(),
                'end': end_time.isoformat()
            })

        if not nodes:
            return ''

        return str(_.script[
            f"drawTimelineChart('timeline-chart', {json.dumps(nodes)})"])
def run_output(path: str, run_id: int, limit: bool):
    """
    Returns the output of a node and its children as html

    Args:
        path: The path of the node
        run_id: The id of the run to return. If None, then the latest run is returned
        limit: When true, at most 1000 output lines are returned

    Returns:
        A script element that calls `nodePage.showOutput(rows, path, truncated)`,
        or an empty string when no run id could be determined
    """
    node, __ = pipelines.find_node(path.split('/'))

    run_id = run_id or _latest_run_id(node.path())
    if not run_id:
        return ''

    line_limit = 1000

    # the prefix comparison on node_path also matches the output of all child nodes
    query = (' SELECT node_path, message, format, is_error'
             ' FROM data_integration_node_run'
             ' JOIN data_integration_node_output USING (node_run_id)'
             ' WHERE node_path [1:%s] = %s AND run_id = %s'
             ' ORDER BY timestamp ')
    if limit:
        # one row more than shown, so that truncation can be detected
        query += 'LIMIT ' + str(line_limit + 1)

    with mara_db.postgresql.postgres_cursor_context(
            'mara') as cursor:  # type: psycopg2.extensions.cursor
        cursor.execute(query, (len(node.path()), node.path(), run_id))
        rows = cursor.fetchall()

    # output can only be truncated when a limit was applied; without this guard an
    # unlimited result of exactly `line_limit + 1` rows was reported as truncated
    truncated = bool(limit) and len(rows) > line_limit
    shown_rows = rows[:line_limit] if limit else rows
    truncated_js = 'true' if truncated else 'false'

    return str(_.script[
        f' nodePage.showOutput({json.dumps(shown_rows)}, "{path}", {truncated_js}); '])
def last_runs_selector(path: str):
    """
    Returns a html select element for selecting among the runs of a node

    Args:
        path: The path of the node

    Returns:
        A `<select..><option ../><option ../></select>` element with one option for
        each row in `data_integration_node_run` for the node, highest run id first
    """
    node, __ = pipelines.find_node(path.split('/'))

    query = (" SELECT run_id,"
             " to_char(start_time, 'Mon DD HH24:MI') AS start_time,"
             " extract(EPOCH FROM (end_time - start_time)) AS duration,"
             " succeeded"
             " FROM data_integration_node_run"
             " WHERE node_path = %s"
             " ORDER BY run_id DESC;")

    with mara_db.postgresql.postgres_cursor_context(
            'mara') as cursor:  # type: psycopg2.extensions.cursor
        cursor.execute(query, (node.path(),))

        # changing the selection switches the page to the chosen run
        selector = _.select(id='last-runs-selector', class_='custom-select',
                            style="border:none",
                            onchange=f"nodePage.switchRun(this.value, '{path}')")

        options = []
        for run_id, start_time, duration, succeeded in cursor.fetchall():
            outcome = "succeeded" if succeeded else "failed"
            label = f'{start_time} ({node_cost.format_duration(duration)}, {outcome})'
            options.append(_.option(value=str(run_id))[label])

        return str(selector[options])
def system_stats(path: str, run_id: int):
    """
    Returns the html for the system statistics chart of a node run

    Args:
        path: The slash-separated path of the node
        run_id: The id of the run to show. If falsy, the result of
            `_latest_run_id` for the node is used instead

    Returns:
        A chart container div followed by a script that calls
        `nodePage.showSystemStats`, or an empty string when there is no run id or
        fewer than 15 statistics rows
    """
    node, __ = pipelines.find_node(path.split('/'))

    if not run_id:
        run_id = _latest_run_id(node.path())
    if not run_id:
        return ''

    # statistics rows whose timestamp lies between start and end of the node run
    query = (' SELECT data_integration_system_statistics.*'
             ' FROM data_integration_system_statistics'
             ' JOIN data_integration_node_run ON timestamp BETWEEN start_time AND end_time'
             ' WHERE run_id = %s AND node_path = %s;')

    with mara_db.postgresql.postgres_cursor_context(
            'mara') as cursor:  # type: psycopg2.extensions.cursor
        cursor.execute(query, (run_id, node.path()))

        # the first column is converted with isoformat() so that the row can be dumped as json
        data = []
        for row in cursor.fetchall():
            data.append([row[0].isoformat()] + list(row[1:]))

        if len(data) < 15:
            return ''

        container = str(_.div(id='system-stats-chart', class_='google-chart')[' '])
        script = str(_.script[f'nodePage.showSystemStats({json.dumps(data)});'])
        return container + script
def dependency_graph(path: str):
    """
    Returns the dependency graph for the node at a path

    Args:
        path: The slash-separated path of the node

    Returns:
        Whatever the module-level name `dependency_graph` returns for the node.
        Aborts with 404 when the node is not found.
    """
    path_segments = path.split('/')
    node, found = pipelines.find_node(path_segments)

    if not found:
        flask.abort(404, f'Node "{path}" not found')

    # NOTE(review): `dependency_graph` is looked up at call time; presumably a later
    # module-level definition of the same name takes the node here - confirm,
    # otherwise this call would re-enter this function with a node instead of a str
    return dependency_graph(node)
def run_page(path: str, with_upstreams: bool, ids: str):
    """
    Creates the page that shows the progress of a pipeline run

    Args:
        path: The slash-separated path of the pipeline
        with_upstreams: Forwarded to the `data_integration.do_run` url and shown in the title
        ids: Slash-separated ids of the nodes of the pipeline to run selectively.
            An empty or missing value selects no specific nodes

    Returns:
        A `response.Response` whose script calls `processRunEvents` with the url of
        the event stream. Aborts with 403 when running from the web ui is disabled
        and with 404 when the pipeline or one of the nodes is not found.
    """
    if not config.allow_run_from_web_ui():
        flask.abort(
            403, 'Running piplelines from web ui is disabled for this instance')

    # the pipeline to run
    pipeline, found = pipelines.find_node(path.split('/'))
    if not found:
        flask.abort(404, f'Pipeline "{path}" not found')
    assert (isinstance(pipeline, pipelines.Pipeline))

    # a list of nodes to run selectively in the pipeline
    requested_ids = ids.split('/') if ids else []
    nodes = []
    for node_id in requested_ids:
        node = pipeline.nodes.get(node_id)
        if not node:
            flask.abort(404, f'Node "{node_id}" not found in pipeline "{path}"')
        nodes.append(node)

    stream_url = flask.url_for('data_integration.do_run', path=path,
                               with_upstreams=with_upstreams, ids=ids)

    # title: "Run [with upstreams] parent / parent [/ [node, node]]"
    parent_links = [str(_.a(href=views.node_url(parent))[parent.id])
                    for parent in pipeline.parents()[1:]]
    title = ['Run ',
             'with upstreams ' if with_upstreams else '',
             ' / '.join(parent_links)]
    if nodes:
        node_links = [str(_.a(href=views.node_url(node))[node.id]) for node in nodes]
        title += [' / [', ', '.join(node_links), ']']

    # starts consuming the event stream once the DOM is ready
    start_script = _.script[
        ''' document.addEventListener('DOMContentLoaded', function() { processRunEvents('''
        + json.dumps(flask.url_for('data_integration.node_page', path=''))
        + ', ' + json.dumps(stream_url)
        + ', ' + json.dumps(pipeline.path())
        + '''); });''']

    # hide reload button until run finishes
    hide_buttons_style = _.style['span.action-buttons > * {display:none}']

    output_column = _.div(class_='col-lg-7')[
        bootstrap.card(body=_.div(id='main-output-area', class_='run-output')[''])]

    timeline_card = bootstrap.card(
        header_left='Timeline',
        body=[_.div(id='system-stats-chart', class_='google-chart')[' '],
              _.div(id='timeline-chart')[' ']])

    tasks_column = _.div(class_='col-lg-5 scroll-container')[
        timeline_card,
        _.div(id='failed-tasks-container')[''],
        _.div(id='running-tasks-container')[''],
        _.div(id='succeeded-tasks-container')[''],
        bootstrap.card(id='card-template', header_left=' ', header_right=' ',
                       body=[_.div(class_='run-output')['']])]

    html = [start_script,
            hide_buttons_style,
            _.div(class_='row')[output_column, tasks_column]]

    js_files = ['https://www.gstatic.com/charts/loader.js']
    for filename in ('timeline-chart.js', 'system-stats-chart.js', 'utils.js',
                     'run-page.js'):
        js_files.append(flask.url_for('data_integration.static', filename=filename))

    css_files = [flask.url_for('data_integration.static', filename=filename)
                 for filename in ('timeline-chart.css', 'run-page.css', 'common.css')]

    rerun_button = response.ActionButton(
        action='javascript:location.reload()',
        label='Run again',
        icon='play',
        title='Run pipeline again with same parameters as before')

    return response.Response(
        html=html,
        js_files=js_files,
        css_files=css_files,
        action_buttons=[rerun_button],
        title=title,
    )