Example #1
def many_materializations_and_passing_expectations(_context):
    tables = [
        'users',
        'groups',
        'events',
        'friends',
        'pages',
        'fans',
        'event_admins',
        'group_admins',
    ]

    for table in tables:
        yield Materialization(
            label='table_info',
            metadata_entries=[
                EventMetadataEntry.path(label='table_path',
                                        path='/path/to/{}.raw'.format(table))
            ],
        )
        yield ExpectationResult(
            success=True,
            label='{table}.row_count'.format(table=table),
            description='Row count passed for {table}'.format(table=table),
        )
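These snippets show compute functions with their decorators stripped. As a rough guide, an event-only solid like the one above would be declared and run with the legacy dagster API along these lines (a minimal sketch; the pipeline name and the empty output_defs are assumptions, since the original decorator is not shown):

from dagster import Materialization, execute_pipeline, pipeline, solid

@solid(output_defs=[])  # event-only: yields events, never an Output
def emit_table_info(_context):
    yield Materialization(label='table_info')

@pipeline
def demo_pipeline():
    emit_table_info()

if __name__ == '__main__':
    execute_pipeline(demo_pipeline)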
Example #2
def backcompat_materialize(_):
    yield Materialization(
        asset_key="all_types",
        description="a materialization with all metadata types",
        metadata_entries=[
            MetadataEntry("text", value="text is cool"),
            MetadataEntry(
                "url", value=MetadataValue.url("https://bigty.pe/neato")),
            MetadataEntry("path",
                          value=MetadataValue.path("/tmp/awesome")),
            MetadataEntry("json", value={"is_dope": True}),
            MetadataEntry(
                "python class",
                value=MetadataValue.python_artifact(MetadataEntry)),
            MetadataEntry(
                "python function",
                value=MetadataValue.python_artifact(file_relative_path)),
            MetadataEntry("float", value=1.2),
            MetadataEntry("int", value=1),
            MetadataEntry("float NaN", value=float("nan")),
            MetadataEntry("long int", value=LONG_INT),
            MetadataEntry("pipeline run",
                          value=MetadataValue.pipeline_run("fake_run_id")),
            MetadataEntry("my asset", value=AssetKey("my_asset")),
        ],
    )
    yield Output(None)
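LONG_INT and file_relative_path are not defined in the snippet itself; a plausible stand-in for each (the exact import path varies across dagster versions, and LONG_INT's value is an assumption):

from dagster.utils import file_relative_path  # dagster's path helper

# Hypothetical: any integer wider than 32 bits exercises the "long int" entry.
LONG_INT = 2**63 - 1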
Example #3
def sort_by_calories(context, cereals):
    sorted_cereals = sorted(
        cereals, key=lambda cereal: int(cereal['calories'])
    )
    context.log.info(
        'Least caloric cereal: {least_caloric}'.format(
            least_caloric=sorted_cereals[0]['name']
        )
    )
    context.log.info(
        'Most caloric cereal: {most_caloric}'.format(
            most_caloric=sorted_cereals[-1]['name']
        )
    )
    fieldnames = list(sorted_cereals[0].keys())
    sorted_cereals_csv_path = os.path.abspath(
        'output/calories_sorted_{run_id}.csv'.format(run_id=context.run_id)
    )
    os.makedirs(os.path.dirname(sorted_cereals_csv_path), exist_ok=True)
    with open(sorted_cereals_csv_path, 'w') as fd:
        writer = csv.DictWriter(fd, fieldnames)
        writer.writeheader()
        writer.writerows(sorted_cereals)
    yield Materialization(
        label='sorted_cereals_csv',
        description='Cereals data frame sorted by caloric content',
        metadata_entries=[
            EventMetadataEntry.path(
                sorted_cereals_csv_path, 'sorted_cereals_csv_path'
            )
        ],
    )
    yield Output(None)
Example #4
def train_lstm_model(context, training_set: TrainingSet):
    X, y = training_set
    breakpoint = context.solid_config['timeseries_train_test_breakpoint']  # pylint: disable=W0622
    X_train, X_test = X[0:breakpoint], X[breakpoint:]
    y_train, y_test = y[0:breakpoint], y[breakpoint:]

    _, n_steps, n_features = X.shape
    model = Sequential()
    model.add(
        LSTM(
            context.solid_config['lstm_layer_config']['num_recurrant_units'],
            activation=context.solid_config['lstm_layer_config']['activation'],
            input_shape=(n_steps, n_features),
        ))
    model.add(Dense(context.solid_config['num_dense_layers']))
    model.compile(
        optimizer=context.solid_config['model_trainig_config']['optimizer'],
        loss=context.solid_config['model_trainig_config']['loss'],
        metrics=['mae'],
    )
    model.fit(
        X_train,
        y_train,
        epochs=context.solid_config['model_trainig_config']['num_epochs'],
        verbose=0,
    )
    results = model.evaluate(X_test, y_test, verbose=0)
    yield Materialization(
        label='test_set_results',
        metadata_entries=[
            EventMetadataEntry.text(str(results[0]), 'Mean Squared Error'),
            EventMetadataEntry.text(str(results[1]), 'Mean Absolute Error'),
        ],
    )
    yield Output(model)
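TrainingSet is a project-defined type. For running the snippet outside its repo, a plausible stand-in (the original project likely wraps this in a DagsterType; this alias is an assumption):

from typing import Tuple
import numpy as np

# Hypothetical alias: a (features, labels) pair of numpy arrays, where
# features has shape (samples, n_steps, n_features).
TrainingSet = Tuple[np.ndarray, np.ndarray]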
Example #5
def write_operation_inventory(context: SolidExecutionContext,
                              analysis: Dict[str, RightSizeAnalysis],
                              resources: DataFrame) -> Nothing:
    resources = resources.set_index('resource_id')
    resizes = [
        {
            'subscription_id': resources.at[resource_id, 'subscription_id'],
            'resource_id': resource_id,
            'current_sku': resources.at[resource_id, 'vm_size'],
            'new_sku': vm_analysis.advisor_sku,
        }
        for resource_id, vm_analysis in analysis.items()
        if vm_analysis.advisor_sku_valid
    ]
    output = {'vm_resize_operations': resizes}

    output_path = os.path.abspath(f'operation_inventory_{context.run_id}.json')
    with open(output_path, 'w') as fd:
        json.dump(output, fd, indent=3)

    yield Materialization(
        label='operation_inventory',
        description='An inventory of the right-sizing operations that are '
        'recommended and validated.',
        metadata_entries=[
            EventMetadataEntry.path(output_path, 'operation_inventory_path')
        ],
    )
    yield Output(None)
Example #6
def logs_events(context):
    context.log_event(AssetMaterialization("first"))
    context.log_event(Materialization("second"))
    context.log_event(ExpectationResult(success=True))
    context.log_event(AssetObservation("fourth"))
    yield AssetMaterialization("fifth")
    yield Output("blah")
Example #7
def course_roles(context: SolidExecutionContext,
                 edx_course_ids: List[String]) -> DagsterPath:
    """Retrieve information about user roles for given courses.

    :param context: Dagster execution context for propagating configuration data
    :type context: SolidExecutionContext

    :param edx_course_ids: List of edX course ID strings
    :type edx_course_ids: List[String]

    :returns: A path definition that points to the rendered data table

    :rtype: DagsterPath
    """
    access_role = Table('student_courseaccessrole')
    roles_query = Query.from_(access_role).select(
        'id', 'user_id', 'org', 'course_id',
        'role').where(access_role.course_id.isin(edx_course_ids))
    query_fields, roles_data = context.resources.sqldb.run_query(roles_query)
    # Maintaining previous file name for compatibility (TMM 2020-05-01)
    roles_path = context.resources.results_dir.path.joinpath('role_query.csv')
    write_csv(query_fields, roles_data, roles_path)
    yield Materialization(
        label='role_query.csv',
        description='Course roles records from Open edX installation',
        metadata_entries=[
            EventMetadataEntry.text(
                label='course_roles_count',
                description='Number of course roles records',
                text=str(len(roles_data))),
            EventMetadataEntry.path(roles_path.name, 'role_query_csv_path')
        ])
    yield Output(roles_path, 'edx_course_roles')
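write_csv is a project helper that is not shown; a minimal sketch consistent with how it is called above (the name and behavior are assumptions):

import csv

def write_csv(fieldnames, rows, path):
    # Hypothetical helper: dump a header row plus the query result rows to CSV.
    with open(path, 'w', newline='') as fd:
        writer = csv.writer(fd)
        writer.writerow(fieldnames)
        writer.writerows(rows)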
Example #8
def df_output_schema(_context, path, value):
    with open(path, 'w') as fd:
        writer = csv.DictWriter(fd, fieldnames=value[0].keys())
        writer.writeheader()
        writer.writerows(rowdicts=value)

    return Materialization.file(path)
Example #9
def spark_df_output_schema(_context, file_type, file_options, spark_df):
    if file_type == 'csv':
        spark_df.write.csv(file_options['path'],
                           header=file_options.get('header'),
                           sep=file_options.get('sep'))
        return Materialization.file(file_options['path'])
    else:
        check.failed('Unsupported file type: {}'.format(file_type))
Example #10
def save_to_file_materialization(_, cfg, value):
    path = cfg['path']
    with open(path, 'w') as ff:
        ff.write(str(value))
    return Materialization(
        'path',
        'Wrote out value to {path}'.format(path=path),
        metadata_entries=[EventMetadataEntry.text(path, 'path')],
    )
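Materializer functions with this (context, config, value) signature were registered with dagster's legacy dagster_type_materializer decorator and attached to a type. A minimal sketch (the config schema and the type definition below are assumptions):

from dagster import DagsterType, dagster_type_materializer

@dagster_type_materializer({'path': str})
def save_to_file_materialization(_, cfg, value):
    ...  # body as in Example #10

# Hypothetical type that uses the materializer above.
MyInt = DagsterType(
    name='MyInt',
    type_check_fn=lambda _, value: isinstance(value, int),
    materializer=save_to_file_materialization,
)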
Example #11
def spark_df_materializer(_context, config, spark_df):
    file_type, file_options = list(config.items())[0]

    if file_type == 'csv':
        spark_df.write.csv(**file_options)
        return Materialization.file(file_options['path'])
    elif file_type == 'parquet':
        spark_df.write.parquet(**file_options)
        return Materialization.file(file_options['path'])
    elif file_type == 'json':
        spark_df.write.json(**file_options)
        return Materialization.file(file_options['path'])
    elif file_type == 'jdbc':
        spark_df.write.jdbc(**file_options)
        return Materialization.file(file_options['url'])
    elif file_type == 'orc':
        spark_df.write.orc(**file_options)
        return Materialization.file(file_options['path'])
    elif file_type == 'saveAsTable':
        spark_df.write.saveAsTable(**file_options)
        return Materialization.file(file_options['name'])
    elif file_type == 'text':
        spark_df.write.text(**file_options)
        return Materialization.file(file_options['path'])
    else:
        check.failed('Unsupported file type: {}'.format(file_type))
Example #12
def raw_file_solid(_context):
    # `name` and `do_expectation` are captured from the enclosing solid
    # factory; see the sketch below.
    yield Materialization(
        label='table_info',
        metadata_entries=[
            EventMetadataEntry.path(label='table_path', path='/path/to/{}.raw'.format(name))
        ],
    )
    yield do_expectation(_context, name)
    yield Output(name)
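`name` is not defined in the snippet itself; in the original it is closed over by a solid factory, roughly like this sketch (the factory name is an assumption, and do_expectation is likewise a project helper not shown):

from dagster import solid

def define_raw_file_solid(name):
    # Hypothetical factory: `name` is closed over by the compute function.
    @solid(name='raw_file_{}'.format(name))
    def raw_file_solid(_context):
        ...  # body as in Example #12

    return raw_file_solid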
Example #13
def load_data_to_database_from_spark(context, data_frame):
    context.resources.db_info.load_table(data_frame,
                                         context.solid_config['table_name'])
    # TODO Flow more information down to the client
    # We should be able to flow multiple key value pairs down to dagit
    # See https://github.com/dagster-io/dagster/issues/1408
    yield Materialization(path='Persisted Db Table: {table_name}'.format(
        table_name=context.solid_config['table_name']))
    yield Result(data_frame)
Example #14
def less_simple_data_frame_output_materialization_config(
        context, config, value):
    # Materialize LessSimpleDataFrame into a csv file
    csv_path = os.path.join(os.path.dirname(__file__),
                            os.path.abspath(config['csv']['path']))
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    with open(csv_path, 'w') as fd:
        fieldnames = list(value[0].keys())
        writer = csv.DictWriter(fd, fieldnames, delimiter=config['csv']['sep'])
        writer.writeheader()
        writer.writerows(value)

    context.log.debug(
        'Wrote dataframe as .csv to {path}'.format(path=csv_path))
    yield Materialization(
        'data_frame_csv',
        'LessSimpleDataFrame materialized as csv',
        [
            EventMetadataEntry.path(
                path=csv_path,
                label='data_frame_csv_path',
                description='LessSimpleDataFrame written to csv format',
            )
        ],
    )
    # Materialize LessSimpleDataFrame into a json file
    json_path = os.path.abspath(config['json']['path'])
    with open(json_path, 'w') as fd:
        json_value = seven.json.dumps([dict(row) for row in value])
        fd.write(json_value)

    context.log.debug(
        'Wrote dataframe as .json to {path}'.format(path=json_path))
    yield Materialization(
        'data_frame_json',
        'LessSimpleDataFrame materialized as json',
        [
            EventMetadataEntry.path(
                path=json_path,
                label='data_frame_json_path',
                description='LessSimpleDataFrame written to json format',
            )
        ],
    )
Example #15
def insert_into_staging_table(context, records: DataFrame, table_name: str):
    _create_and_load_staging_table(context.resources.postgres_db.engine, table_name, records)
    yield Materialization(
        label=table_name,
        description='Table {} created in database {}'.format(
            table_name, context.resources.postgres_db.db_name
        ),
        metadata_entries=[EventMetadataEntry.text(str(len(records)), "num rows inserted")],
    )
    yield Output(output_name='staging_table', value=table_name)
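_create_and_load_staging_table is not shown; a minimal sketch using pandas, consistent with the call above (the helper's behavior is an assumption):

def _create_and_load_staging_table(engine, table_name, records):
    # Hypothetical: write the DataFrame into a fresh staging table,
    # replacing any previous contents.
    records.to_sql(table_name, con=engine, if_exists='replace', index=False)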
Example #16
def train_lstm_model_and_upload_to_gcs(context, training_set: TrainingSet,
                                       bucket_name: str):
    from keras.layers import LSTM, Dense
    from keras.models import Sequential

    X, y = training_set
    breakpoint = context.solid_config['timeseries_train_test_breakpoint']  # pylint: disable=W0622
    X_train, X_test = X[0:breakpoint], X[breakpoint:]
    y_train, y_test = y[0:breakpoint], y[breakpoint:]

    _, n_steps, n_features = X.shape
    model = Sequential()
    model.add(
        LSTM(
            context.solid_config['lstm_layer_config']['num_recurrant_units'],
            activation=context.solid_config['lstm_layer_config']['activation'],
            input_shape=(n_steps, n_features),
        ))
    model.add(Dense(context.solid_config['num_dense_layers']))
    model.compile(
        optimizer=context.solid_config['model_trainig_config']['optimizer'],
        loss=context.solid_config['model_trainig_config']['loss'],
        metrics=['mae'],
    )
    model.fit(
        X_train,
        y_train,
        epochs=context.solid_config['model_trainig_config']['num_epochs'],
        verbose=0,
    )
    results = model.evaluate(X_test, y_test, verbose=0)

    # save model and upload
    gcs_bucket = context.resources.gcs_client.get_bucket(bucket_name)
    key = 'model-{}.h5'.format(uuid.uuid4())
    with tempfile.TemporaryFile('w+b') as fp:
        model.save(fp)
        # Rewind to the start: a TemporaryFile's contents can't be read back
        # outside the context manager, so the upload happens here.
        fp.seek(0)
        gcs_bucket.blob(key).upload_from_file(fp)

    yield Materialization(
        description='Serialized model to Google Cloud Storage Bucket',
        label='Serialized model and uploaded to gcs',
        metadata_entries=[
            EventMetadataEntry.text(
                'gs://{bucket_name}/{key}'.format(bucket_name=bucket_name,
                                                  key=key),
                'google cloud storage URI',
            ),
            EventMetadataEntry.text(str(results[0]), 'Mean Squared Error'),
            EventMetadataEntry.text(str(results[1]), 'Mean Absolute Error'),
        ],
    )
    yield Output(model)
Example #17
def emit_events_solid(_, input_num):
    a_num = input_num + 1
    a_string = 'foo'
    yield ExpectationResult(
        success=a_num > 0, label='positive', description='A num must be positive'
    )
    yield Materialization(
        label='persisted_string', description='Let us pretend we persisted the string somewhere'
    )
    yield Output(value=a_num, output_name='a_num')
    yield Output(value=a_string, output_name='a_string')
Example #18
def materialize_one(_):
    # `asset_key` is captured from the enclosing scope; see the sketch below.
    yield Materialization(
        label='one',
        asset_key=asset_key,
        metadata_entries=[
            EventMetadataEntry.text('hello', 'text'),
            EventMetadataEntry.json({'hello': 'world'}, 'json'),
            EventMetadataEntry.float(1.0, 'one'),
        ],
    )
    yield Output(1)
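asset_key comes from the enclosing test module; a plausible definition (the key itself is an assumption):

from dagster import AssetKey

# Hypothetical: defined once at module scope and shared with assertions.
asset_key = AssetKey('one')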
Example #19
def materialize(_):
    yield Materialization(
        label='all_types',
        description='a materialization with all metadata types',
        metadata_entries=[
            EventMetadataEntry.text('text is cool', 'text'),
            EventMetadataEntry.url('https://bigty.pe/neato', 'url'),
            EventMetadataEntry.fspath('/tmp/awesome', 'path'),
            EventMetadataEntry.json({'is_dope': True}, 'json'),
        ],
    )
    yield Output(None)
Example #20
def _base_compute(context):
    time.sleep(context.solid_config['sleep'])

    if random() < context.solid_config['error_rate']:
        raise Exception('blah')

    if context.solid_config.get('materialization_key') is not None:
        metadata_entries = []
        if context.solid_config.get('materialization_text') is not None:
            metadata_entries.append(
                EventMetadataEntry.text(
                    context.solid_config.get('materialization_text'), context.solid.name,
                )
            )

        if context.solid_config.get('materialization_url') is not None:
            metadata_entries.append(
                EventMetadataEntry.url(
                    context.solid_config.get('materialization_url'), context.solid.name,
                )
            )

        if context.solid_config.get('materialization_path') is not None:
            metadata_entries.append(
                EventMetadataEntry.path(
                    context.solid_config.get('materialization_path'), context.solid.name,
                )
            )

        if context.solid_config.get('materialization_json') is not None:
            metadata_entries.append(
                EventMetadataEntry.json(
                    context.solid_config.get('materialization_json'), context.solid.name,
                )
            )

        if context.solid_config.get('materialization_value') is not None:
            metadata_entries.append(
                EventMetadataEntry.float(
                    context.solid_config.get('materialization_value'), context.solid.name,
                )
            )

        if len(metadata_entries) == 0:
            metadata_entries = None

        yield Materialization(
            label=context.solid.name,
            asset_key=context.solid_config.get('materialization_key'),
            metadata_entries=metadata_entries,
        )

    yield Output(1)
Example #21
def my_asset_key_materialization_solid(context, df):
    do_some_transform(df)
    persist_to_storage(df)
    yield Materialization(
        asset_key=AssetKey(['dashboard', 'my_cool_site']),
        description='Persisted result to storage',
        metadata_entries=[
            EventMetadataEntry.url('http://mycoolsite.com/dashboard',
                                   label='dashboard_url'),
            EventMetadataEntry.float(calculate_bytes(df), 'size (bytes)'),
        ],
    )
    yield Output(df)
Example #22
def send_to_slack(context, download_data):

    transaction_data = download_data[0]
    block_data = download_data[1]

    transaction_date = transaction_data['transaction_date'][0]
    block_date = block_data['block_date'][0]

    yield ExpectationResult(
        label='dates_match',
        success=transaction_date == block_date,
        metadata_entries=[
            EventMetadataEntry.text(str(transaction_date), 'transaction_date'),
            EventMetadataEntry.text(str(block_date), 'block_date'),
        ],
    )

    date = transaction_date
    dash_transferred = transaction_data['DASH_transferred'][0]
    dash_blocks = block_data['DASH_blocks'][0]
    average_dash_transferred_per_block = float(dash_transferred) / dash_blocks

    yield Materialization(
        label='data',
        metadata_entries=[
            EventMetadataEntry.text(
                '{dash_transferred} dash transferred'.format(
                    dash_transferred=dash_transferred),
                'dash_transferred',
            ),
            EventMetadataEntry.text(
                '{dash_blocks} dash blocks'.format(dash_blocks=dash_blocks),
                'dash_blocks'),
        ],
    )

    context.resources.slack.chat.post_message(
        channel='#metrics-testing',
        text=(
            '{date}\nDash Transferred: {dash_transferred}\n'
            'Dash blocks: {dash_blocks}\n'
            'Average dash transferred/block: {average_dash_transferred_per_block}'
        ).format(
            date=date,
            dash_transferred=dash_transferred,
            dash_blocks=dash_blocks,
            average_dash_transferred_per_block=average_dash_transferred_per_block,
        ),
    )

    yield Output(1)
Example #23
def try_parse_create_view(text):
    view_match = CREATE_VIEW_REGEX.search(text)

    if not view_match:
        return None

    return Materialization(
        label='create_view',
        description=text,
        metadata_entries=[
            EventMetadataEntry.text(view_match.group(1), 'schema'),
            EventMetadataEntry.text(view_match.group(2), 'view'),
        ],
    )
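CREATE_VIEW_REGEX is defined elsewhere in the source module; a hypothetical pattern with the two groups the code expects (schema, view):

import re

# Hypothetical pattern, e.g. matching "CREATE VIEW analytics.daily_users".
CREATE_VIEW_REGEX = re.compile(r'CREATE VIEW (\w+)\.(\w+)', re.IGNORECASE)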
Example #24
def many_table_materializations(_context):
    for table in raw_tables:
        yield Materialization(
            label='table_info',
            metadata_entries=[
                EventMetadataEntry.text(text=table, label='table_name'),
                EventMetadataEntry.fspath(path='/path/to/{}'.format(table),
                                          label='table_path'),
                EventMetadataEntry.json(data={'name': table},
                                        label='table_data'),
                EventMetadataEntry.url(url='https://bigty.pe/{}'.format(table),
                                       label='table_name_big'),
            ],
        )
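raw_tables is a module-level list the solid iterates over; a plausible stand-in (the table names are assumptions):

raw_tables = ['users', 'groups', 'events', 'friends', 'pages', 'fans']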
Example #25
def materialization_and_expectation(_context):
    yield Materialization(
        label='all_types',
        description='a materialization with all metadata types',
        metadata_entries=[
            EventMetadataEntry.text('text is cool', 'text'),
            EventMetadataEntry.url('https://bigty.pe/neato', 'url'),
            EventMetadataEntry.fspath('/tmp/awesome', 'path'),
            EventMetadataEntry.json({'is_dope': True}, 'json'),
        ],
    )
    yield ExpectationResult(success=True, label='row_count', description='passed')
    yield ExpectationResult(True)
    yield Output(True)
Example #26
def write_html_report(context: SolidExecutionContext,
                      report_notebook: FileHandle) -> Nothing:
    with context.file_manager.read(report_notebook) as node_file:
        node = nbformat.read(node_file, nbformat.NO_CONVERT)
    html = convert_nodebook_node_to_html(node, full_width=True)
    handle = context.file_manager.write_data(html.encode(), ext='html')
    yield Materialization(
        label='resize_report',
        description='A report of all VM utilization data and an evaluation '
        'of the recommendations.',
        metadata_entries=[
            EventMetadataEntry.path(handle.path_desc, 'resize_report_path')
        ],
    )
Example #27
def file_handle_to_s3(context, file_handle):
    bucket = context.solid_config['Bucket']
    key = context.solid_config['Key']

    with context.file_manager.read(file_handle, 'rb') as fileobj:
        context.resources.s3.upload_fileobj(fileobj, bucket, key)
        s3_file_handle = S3FileHandle(bucket, key)

        yield Materialization(
            label='file_to_s3',
            metadata_entries=[EventMetadataEntry.path(s3_file_handle.s3_path, label=last_key(key))],
        )

        yield Output(value=s3_file_handle, output_name='s3_file_handle')
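last_key is a small helper not shown here; a minimal sketch consistent with its use as a metadata label (the behavior is an assumption):

def last_key(key: str) -> str:
    # Hypothetical: return the final component of an S3 key,
    # e.g. last_key('a/b/c.csv') -> 'c.csv'.
    return key.rsplit('/', 1)[-1]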
Example #28
def try_parse_create_table(text):
    table_match = CREATE_TABLE_REGEX.search(text)

    if not table_match:
        return None

    return Materialization(
        label='create_table',
        description=text,
        metadata_entries=[
            EventMetadataEntry.text(table_match.group(1), 'schema'),
            EventMetadataEntry.text(table_match.group(2), 'table'),
            EventMetadataEntry.text(table_match.group(3), 'row_count'),
        ],
    )
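CREATE_TABLE_REGEX is likewise defined elsewhere; a hypothetical pattern with the three groups the code expects (schema, table, row count):

import re

# Hypothetical pattern, e.g. matching
# "CREATE TABLE analytics.events ... 42 rows".
CREATE_TABLE_REGEX = re.compile(
    r'CREATE TABLE (\w+)\.(\w+).*?(\d+) rows', re.IGNORECASE | re.DOTALL
)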
Example #29
def less_simple_data_frame_output_materialization_config(
        context, config, value):
    csv_path = os.path.abspath(config['csv']['path'])
    with open(csv_path, 'w') as fd:
        fieldnames = list(value[0].keys())
        writer = csv.DictWriter(fd, fieldnames, delimiter=config['csv']['sep'])
        writer.writeheader()
        writer.writerows(value)
    context.log.debug(
        'Wrote dataframe as .csv to {path}'.format(path=csv_path))
    return Materialization(
        'data_frame_csv',
        'LessSimpleDataFrame materialized as csv',
        [EventMetadataEntry.path(csv_path, 'data_frame_csv_path')],
    )
Example #30
def load_data_to_database_from_spark(context, data_frame: DataFrame):
    context.resources.db_info.load_table(data_frame, context.solid_config['table_name'])

    table_name = context.solid_config['table_name']
    yield Materialization(
        label='Table: {table_name}'.format(table_name=table_name),
        description=(
            'Persisted table {table_name} in database configured in the db_info resource.'
        ).format(table_name=table_name),
        metadata_entries=[
            EventMetadataEntry.text(label='Host', text=context.resources.db_info.host),
            EventMetadataEntry.text(label='Db', text=context.resources.db_info.db_name),
        ],
    )
    yield Output(value=table_name, output_name='table_name')