def many_materializations_and_passing_expectations(_context):
    tables = [
        'users',
        'groups',
        'events',
        'friends',
        'pages',
        'fans',
        'event_admins',
        'group_admins',
    ]
    for table in tables:
        yield Materialization(
            label='table_info',
            metadata_entries=[
                EventMetadataEntry.path(
                    label='table_path', path='/path/to/{}.raw'.format(table)
                )
            ],
        )
        yield ExpectationResult(
            success=True,
            label='{table}.row_count'.format(table=table),
            description='Row count passed for {table}'.format(table=table),
        )

def backcompat_materialize(_):
    yield Materialization(
        asset_key="all_types",
        description="a materialization with all metadata types",
        metadata_entries=[
            MetadataEntry("text", value="text is cool"),
            MetadataEntry("url", value=MetadataValue.url("https://bigty.pe/neato")),
            MetadataEntry("path", value=MetadataValue.path("/tmp/awesome")),
            MetadataEntry("json", value={"is_dope": True}),
            MetadataEntry("python class", value=MetadataValue.python_artifact(MetadataEntry)),
            MetadataEntry("python function", value=MetadataValue.python_artifact(file_relative_path)),
            MetadataEntry("float", value=1.2),
            MetadataEntry("int", value=1),
            MetadataEntry("float NaN", value=float("nan")),
            MetadataEntry("long int", value=LONG_INT),
            MetadataEntry("pipeline run", value=MetadataValue.pipeline_run("fake_run_id")),
            MetadataEntry("my asset", value=AssetKey("my_asset")),
        ],
    )
    yield Output(None)

def sort_by_calories(context, cereals):
    sorted_cereals = sorted(cereals, key=lambda cereal: int(cereal['calories']))
    context.log.info(
        'Least caloric cereal: {least_caloric}'.format(
            least_caloric=sorted_cereals[0]['name']
        )
    )
    context.log.info(
        'Most caloric cereal: {most_caloric}'.format(
            most_caloric=sorted_cereals[-1]['name']
        )
    )
    fieldnames = list(sorted_cereals[0].keys())
    sorted_cereals_csv_path = os.path.abspath(
        'output/calories_sorted_{run_id}.csv'.format(run_id=context.run_id)
    )
    os.makedirs(os.path.dirname(sorted_cereals_csv_path), exist_ok=True)
    with open(sorted_cereals_csv_path, 'w') as fd:
        writer = csv.DictWriter(fd, fieldnames)
        writer.writeheader()
        writer.writerows(sorted_cereals)
    yield Materialization(
        label='sorted_cereals_csv',
        description='Cereals data frame sorted by caloric content',
        metadata_entries=[
            EventMetadataEntry.path(sorted_cereals_csv_path, 'sorted_cereals_csv_path')
        ],
    )
    yield Output(None)

def train_lstm_model(context, training_set: TrainingSet):
    X, y = training_set
    breakpoint = context.solid_config['timeseries_train_test_breakpoint']  # pylint: disable=W0622
    X_train, X_test = X[0:breakpoint], X[breakpoint:]
    y_train, y_test = y[0:breakpoint], y[breakpoint:]

    _, n_steps, n_features = X.shape
    model = Sequential()
    model.add(
        LSTM(
            context.solid_config['lstm_layer_config']['num_recurrant_units'],
            activation=context.solid_config['lstm_layer_config']['activation'],
            input_shape=(n_steps, n_features),
        )
    )
    model.add(Dense(context.solid_config['num_dense_layers']))
    model.compile(
        optimizer=context.solid_config['model_trainig_config']['optimizer'],
        loss=context.solid_config['model_trainig_config']['loss'],
        metrics=['mae'],
    )
    model.fit(
        X_train,
        y_train,
        epochs=context.solid_config['model_trainig_config']['num_epochs'],
        verbose=0,
    )
    results = model.evaluate(X_test, y_test, verbose=0)
    yield Materialization(
        label='test_set_results',
        metadata_entries=[
            EventMetadataEntry.text(str(results[0]), 'Mean Squared Error'),
            EventMetadataEntry.text(str(results[1]), 'Mean Absolute Error'),
        ],
    )
    yield Output(model)

def write_operation_inventory(
    context: SolidExecutionContext,
    analysis: Dict[str, RightSizeAnalysis],
    resources: DataFrame,
) -> Nothing:
    resources = resources.set_index('resource_id')
    resizes = [
        {
            'subscription_id': resources.at[resource_id, 'subscription_id'],
            'resource_id': resource_id,
            'current_sku': resources.at[resource_id, 'vm_size'],
            'new_sku': resource_analysis.advisor_sku,
        }
        for resource_id, resource_analysis in analysis.items()
        if resource_analysis.advisor_sku_valid
    ]
    output = {'vm_resize_operations': resizes}
    output_path = os.path.abspath(f'operation_inventory_{context.run_id}.json')
    with open(output_path, 'w') as fd:
        json.dump(output, fd, indent=3)
    yield Materialization(
        label='operation_inventory',
        description='An inventory of the right-sizing operations that are recommended and validated.',
        metadata_entries=[
            EventMetadataEntry.path(output_path, 'operation_inventory_path')
        ],
    )
    yield Output(None)

def logs_events(context):
    context.log_event(AssetMaterialization("first"))
    context.log_event(Materialization("second"))
    context.log_event(ExpectationResult(success=True))
    context.log_event(AssetObservation("fourth"))
    yield AssetMaterialization("fifth")
    yield Output("blah")

def course_roles(context: SolidExecutionContext, edx_course_ids: List[String]) -> DagsterPath:
    """Retrieve information about user roles for given courses.

    :param context: Dagster execution context for propagating configuration data
    :type context: SolidExecutionContext

    :param edx_course_ids: List of edX course ID strings
    :type edx_course_ids: List[String]

    :returns: A path definition that points to the rendered data table
    :rtype: DagsterPath
    """
    access_role = Table('student_courseaccessrole')
    roles_query = Query.from_(access_role).select(
        'id', 'user_id', 'org', 'course_id', 'role'
    ).where(access_role.course_id.isin(edx_course_ids))
    query_fields, roles_data = context.resources.sqldb.run_query(roles_query)
    # Maintaining previous file name for compatibility (TMM 2020-05-01)
    roles_path = context.resources.results_dir.path.joinpath('role_query.csv')
    write_csv(query_fields, roles_data, roles_path)
    yield Materialization(
        label='role_query.csv',
        description='Course roles records from Open edX installation',
        metadata_entries=[
            EventMetadataEntry.text(
                label='course_roles_count',
                description='Number of course roles records',
                text=str(len(roles_data)),
            ),
            EventMetadataEntry.path(roles_path.name, 'role_query_csv_path'),
        ],
    )
    yield Output(roles_path, 'edx_course_roles')

def df_output_schema(_context, path, value):
    with open(path, 'w') as fd:
        writer = csv.DictWriter(fd, fieldnames=value[0].keys())
        writer.writeheader()
        writer.writerows(rowdicts=value)
    return Materialization.file(path)

def spark_df_output_schema(_context, file_type, file_options, spark_df):
    if file_type == 'csv':
        spark_df.write.csv(
            file_options['path'],
            header=file_options.get('header'),
            sep=file_options.get('sep'),
        )
        return Materialization.file(file_options['path'])
    else:
        check.failed('Unsupported file type: {}'.format(file_type))

def save_to_file_materialization(_, cfg, value):
    path = cfg['path']
    with open(path, 'w') as ff:
        ff.write(str(value))
    return Materialization(
        'path',
        'Wrote out value to {path}'.format(path=path),
        metadata_entries=[EventMetadataEntry.text(path, 'path')],
    )

def spark_df_materializer(_context, config, spark_df):
    file_type, file_options = list(config.items())[0]

    if file_type == 'csv':
        spark_df.write.csv(**file_options)
        return Materialization.file(file_options['path'])
    elif file_type == 'parquet':
        spark_df.write.parquet(**file_options)
        return Materialization.file(file_options['path'])
    elif file_type == 'json':
        spark_df.write.json(**file_options)
        return Materialization.file(file_options['path'])
    elif file_type == 'jdbc':
        spark_df.write.jdbc(**file_options)
        return Materialization.file(file_options['url'])
    elif file_type == 'orc':
        spark_df.write.orc(**file_options)
        return Materialization.file(file_options['path'])
    elif file_type == 'saveAsTable':
        spark_df.write.saveAsTable(**file_options)
        return Materialization.file(file_options['name'])
    elif file_type == 'text':
        spark_df.write.text(**file_options)
        return Materialization.file(file_options['path'])
    else:
        check.failed('Unsupported file type: {}'.format(file_type))

def raw_file_solid(_context):
    # Note: `name` and `do_expectation` are assumed to be captured from the
    # enclosing factory function that defines this solid.
    yield Materialization(
        label='table_info',
        metadata_entries=[
            EventMetadataEntry.path(label='table_path', path='/path/to/{}.raw'.format(name))
        ],
    )
    yield do_expectation(_context, name)
    yield Output(name)

def load_data_to_database_from_spark(context, data_frame):
    context.resources.db_info.load_table(data_frame, context.solid_config['table_name'])
    # TODO Flow more information down to the client
    # We should be able to flow multiple key value pairs down to dagit
    # See https://github.com/dagster-io/dagster/issues/1408
    yield Materialization(
        path='Persisted Db Table: {table_name}'.format(
            table_name=context.solid_config['table_name']
        )
    )
    yield Result(data_frame)

def less_simple_data_frame_output_materialization_config(context, config, value):
    # Materialize LessSimpleDataFrame into a csv file
    csv_path = os.path.join(
        os.path.dirname(__file__), os.path.abspath(config['csv']['path'])
    )
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    with open(csv_path, 'w') as fd:
        fieldnames = list(value[0].keys())
        writer = csv.DictWriter(fd, fieldnames, delimiter=config['csv']['sep'])
        writer.writeheader()
        writer.writerows(value)
    context.log.debug('Wrote dataframe as .csv to {path}'.format(path=csv_path))
    yield Materialization(
        'data_frame_csv',
        'LessSimpleDataFrame materialized as csv',
        [
            EventMetadataEntry.path(
                path=csv_path,
                label='data_frame_csv_path',
                description='LessSimpleDataFrame written to csv format',
            )
        ],
    )
    # Materialize LessSimpleDataFrame into a json file
    json_path = os.path.abspath(config['json']['path'])
    with open(json_path, 'w') as fd:
        json_value = seven.json.dumps([dict(row) for row in value])
        fd.write(json_value)
    context.log.debug('Wrote dataframe as .json to {path}'.format(path=json_path))
    yield Materialization(
        'data_frame_json',
        'LessSimpleDataFrame materialized as json',
        [
            EventMetadataEntry.path(
                path=json_path,
                label='data_frame_json_path',
                description='LessSimpleDataFrame written to json format',
            )
        ],
    )

def insert_into_staging_table(context, records: DataFrame, table_name: str):
    _create_and_load_staging_table(context.resources.postgres_db.engine, table_name, records)
    yield Materialization(
        label=table_name,
        description='Table {} created in database {}'.format(
            table_name, context.resources.postgres_db.db_name
        ),
        metadata_entries=[
            EventMetadataEntry.text(str(len(records)), "num rows inserted")
        ],
    )
    yield Output(output_name='staging_table', value=table_name)

def train_lstm_model_and_upload_to_gcs(context, training_set: TrainingSet, bucket_name: str):
    from keras.layers import LSTM, Dense
    from keras.models import Sequential

    X, y = training_set
    breakpoint = context.solid_config['timeseries_train_test_breakpoint']  # pylint: disable=W0622
    X_train, X_test = X[0:breakpoint], X[breakpoint:]
    y_train, y_test = y[0:breakpoint], y[breakpoint:]

    _, n_steps, n_features = X.shape
    model = Sequential()
    model.add(
        LSTM(
            context.solid_config['lstm_layer_config']['num_recurrant_units'],
            activation=context.solid_config['lstm_layer_config']['activation'],
            input_shape=(n_steps, n_features),
        )
    )
    model.add(Dense(context.solid_config['num_dense_layers']))
    model.compile(
        optimizer=context.solid_config['model_trainig_config']['optimizer'],
        loss=context.solid_config['model_trainig_config']['loss'],
        metrics=['mae'],
    )
    model.fit(
        X_train,
        y_train,
        epochs=context.solid_config['model_trainig_config']['num_epochs'],
        verbose=0,
    )
    results = model.evaluate(X_test, y_test, verbose=0)

    # save model and upload
    gcs_bucket = context.resources.gcs_client.get_bucket(bucket_name)
    key = 'model-{}.h5'.format(uuid.uuid4())
    with tempfile.TemporaryFile('w+b') as fp:
        model.save(fp)
        # Done because you can't upload the contents of a file outside the
        # context manager if it's a tempfile.
        fp.seek(0)
        gcs_bucket.blob(key).upload_from_file(fp)

    yield Materialization(
        description='Serialized model to Google Cloud Storage Bucket',
        label='Serialized model and uploaded to gcs',
        metadata_entries=[
            EventMetadataEntry.text(
                'gs://{bucket_name}/{key}'.format(bucket_name=bucket_name, key=key),
                'google cloud storage URI',
            ),
            EventMetadataEntry.text(str(results[0]), 'Mean Squared Error'),
            EventMetadataEntry.text(str(results[1]), 'Mean Absolute Error'),
        ],
    )
    yield Output(model)

def emit_events_solid(_, input_num):
    a_num = input_num + 1
    a_string = 'foo'
    yield ExpectationResult(
        success=a_num > 0, label='positive', description='A num must be positive'
    )
    yield Materialization(
        label='persisted_string',
        description='Let us pretend we persisted the string somewhere',
    )
    yield Output(value=a_num, output_name='a_num')
    yield Output(value=a_string, output_name='a_string')

def materialize_one(_):
    # Note: `asset_key` is assumed to be defined in the enclosing scope of this example.
    yield Materialization(
        label='one',
        asset_key=asset_key,
        metadata_entries=[
            EventMetadataEntry.text('hello', 'text'),
            EventMetadataEntry.json({'hello': 'world'}, 'json'),
            EventMetadataEntry.float(1.0, 'one'),
        ],
    )
    yield Output(1)

def materialize(_):
    yield Materialization(
        label='all_types',
        description='a materialization with all metadata types',
        metadata_entries=[
            EventMetadataEntry.text('text is cool', 'text'),
            EventMetadataEntry.url('https://bigty.pe/neato', 'url'),
            EventMetadataEntry.fspath('/tmp/awesome', 'path'),
            EventMetadataEntry.json({'is_dope': True}, 'json'),
        ],
    )
    yield Output(None)

def _base_compute(context):
    time.sleep(context.solid_config['sleep'])

    if random() < context.solid_config['error_rate']:
        raise Exception('blah')

    if context.solid_config.get('materialization_key') is not None:
        metadata_entries = []
        if context.solid_config.get('materialization_text') is not None:
            metadata_entries.append(
                EventMetadataEntry.text(
                    context.solid_config.get('materialization_text'),
                    context.solid.name,
                )
            )
        if context.solid_config.get('materialization_url') is not None:
            metadata_entries.append(
                EventMetadataEntry.url(
                    context.solid_config.get('materialization_url'),
                    context.solid.name,
                )
            )
        if context.solid_config.get('materialization_path') is not None:
            metadata_entries.append(
                EventMetadataEntry.path(
                    context.solid_config.get('materialization_path'),
                    context.solid.name,
                )
            )
        if context.solid_config.get('materialization_json') is not None:
            metadata_entries.append(
                EventMetadataEntry.json(
                    context.solid_config.get('materialization_json'),
                    context.solid.name,
                )
            )
        if context.solid_config.get('materialization_value') is not None:
            metadata_entries = [
                EventMetadataEntry.float(
                    context.solid_config.get('materialization_value'),
                    context.solid.name,
                )
            ]
        if len(metadata_entries) == 0:
            metadata_entries = None
        yield Materialization(
            label=context.solid.name,
            asset_key=context.solid_config.get('materialization_key'),
            metadata_entries=metadata_entries,
        )

    yield Output(1)

def my_asset_key_materialization_solid(context, df):
    do_some_transform(df)
    persist_to_storage(df)
    yield Materialization(
        asset_key=AssetKey(['dashboard', 'my_cool_site']),
        description='Persisted result to storage',
        metadata_entries=[
            EventMetadataEntry.url('http://mycoolsite.com/dashboard', label='dashboard_url'),
            EventMetadataEntry.float(calculate_bytes(df), 'size (bytes)'),
        ],
    )
    yield Output(df)

def send_to_slack(context, download_data):
    transaction_data = download_data[0]
    block_data = download_data[1]

    transaction_date = transaction_data['transaction_date'][0]
    block_date = block_data['block_date'][0]
    yield ExpectationResult(
        label='dates_match',
        success=transaction_date == block_date,
        metadata_entries=[
            EventMetadataEntry.text(str(transaction_date), 'transaction_date'),
            EventMetadataEntry.text(str(block_date), 'block_date'),
        ],
    )

    date = transaction_date
    dash_transferred = transaction_data['DASH_transferred'][0]
    dash_blocks = block_data['DASH_blocks'][0]
    average_dash_transferred_per_block = float(dash_transferred) / dash_blocks
    yield Materialization(
        label='data',
        metadata_entries=[
            EventMetadataEntry.text(
                '{dash_transferred} dash transferred'.format(dash_transferred=dash_transferred),
                'dash_transferred',
            ),
            EventMetadataEntry.text(
                '{dash_blocks} dash blocks'.format(dash_blocks=dash_blocks),
                'dash_blocks',
            ),
        ],
    )

    context.resources.slack.chat.post_message(
        channel='#metrics-testing',
        text=(
            '{date}\nDash Transferred: {dash_transferred}\nDash blocks: {dash_blocks}\n'
            'Average dash transferred/block: {average_dash_transferred_per_block}'
        ).format(
            date=date,
            dash_transferred=dash_transferred,
            dash_blocks=dash_blocks,
            average_dash_transferred_per_block=average_dash_transferred_per_block,
        ),
    )
    yield Output(1)

def try_parse_create_view(text):
    view_match = CREATE_VIEW_REGEX.search(text)
    if not view_match:
        return None

    return Materialization(
        label='create_view',
        description=text,
        metadata_entries=[
            EventMetadataEntry.text(view_match.group(1), 'schema'),
            EventMetadataEntry.text(view_match.group(2), 'view'),
        ],
    )

def many_table_materializations(_context):
    for table in raw_tables:
        yield Materialization(
            label='table_info',
            metadata_entries=[
                EventMetadataEntry.text(text=table, label='table_name'),
                EventMetadataEntry.fspath(path='/path/to/{}'.format(table), label='table_path'),
                EventMetadataEntry.json(data={'name': table}, label='table_data'),
                EventMetadataEntry.url(url='https://bigty.pe/{}'.format(table), label='table_name_big'),
            ],
        )

def materialization_and_expectation(_context):
    yield Materialization(
        label='all_types',
        description='a materialization with all metadata types',
        metadata_entries=[
            EventMetadataEntry.text('text is cool', 'text'),
            EventMetadataEntry.url('https://bigty.pe/neato', 'url'),
            EventMetadataEntry.fspath('/tmp/awesome', 'path'),
            EventMetadataEntry.json({'is_dope': True}, 'json'),
        ],
    )
    yield ExpectationResult(success=True, label='row_count', description='passed')
    yield ExpectationResult(True)
    yield Output(True)

def write_html_report(context: SolidExecutionContext, report_notebook: FileHandle) -> Nothing:
    with context.file_manager.read(report_notebook) as node_file:
        node = nbformat.read(node_file, nbformat.NO_CONVERT)
    html = convert_nodebook_node_to_html(node, full_width=True)
    handle = context.file_manager.write_data(html.encode(), ext='html')
    yield Materialization(
        label='resize_report',
        description='A report of all VM utilization data and an evaluation of the recommendations.',
        metadata_entries=[
            EventMetadataEntry.path(handle.path_desc, 'resize_report_path')
        ],
    )

def file_handle_to_s3(context, file_handle):
    bucket = context.solid_config['Bucket']
    key = context.solid_config['Key']

    with context.file_manager.read(file_handle, 'rb') as fileobj:
        context.resources.s3.upload_fileobj(fileobj, bucket, key)

    s3_file_handle = S3FileHandle(bucket, key)
    yield Materialization(
        label='file_to_s3',
        metadata_entries=[
            EventMetadataEntry.path(s3_file_handle.s3_path, label=last_key(key))
        ],
    )
    yield Output(value=s3_file_handle, output_name='s3_file_handle')

def try_parse_create_table(text):
    table_match = CREATE_TABLE_REGEX.search(text)
    if not table_match:
        return None

    return Materialization(
        label='create_table',
        description=text,
        metadata_entries=[
            EventMetadataEntry.text(table_match.group(1), 'schema'),
            EventMetadataEntry.text(table_match.group(2), 'table'),
            EventMetadataEntry.text(table_match.group(3), 'row_count'),
        ],
    )

def less_simple_data_frame_output_materialization_config(context, config, value):
    csv_path = os.path.abspath(config['csv']['path'])
    with open(csv_path, 'w') as fd:
        fieldnames = list(value[0].keys())
        writer = csv.DictWriter(fd, fieldnames, delimiter=config['csv']['sep'])
        writer.writeheader()
        writer.writerows(value)
    context.log.debug('Wrote dataframe as .csv to {path}'.format(path=csv_path))
    return Materialization(
        'data_frame_csv',
        'LessSimpleDataFrame materialized as csv',
        [EventMetadataEntry.path(csv_path, 'data_frame_csv_path')],
    )

def load_data_to_database_from_spark(context, data_frame: DataFrame):
    context.resources.db_info.load_table(data_frame, context.solid_config['table_name'])

    table_name = context.solid_config['table_name']
    yield Materialization(
        label='Table: {table_name}'.format(table_name=table_name),
        description=(
            'Persisted table {table_name} in database configured in the db_info resource.'
        ).format(table_name=table_name),
        metadata_entries=[
            EventMetadataEntry.text(label='Host', text=context.resources.db_info.host),
            EventMetadataEntry.text(label='Db', text=context.resources.db_info.db_name),
        ],
    )
    yield Output(value=table_name, output_name='table_name')