Пример #1
0
def test_two_input_solid():
    """Check a solid that takes two DataFrame inputs.

    Two ``load_csv`` solids, both configured with ``num.csv``, feed a solid
    that adds ``num1`` from its first input to ``num2`` from its second. The
    resulting frame is compared against the expected columns.
    """

    def transform(_context, inputs):
        num_csv1 = inputs['num_csv1']
        num_csv2 = inputs['num_csv2']
        check.inst_param(num_csv1, 'num_csv1', pd.DataFrame)
        check.inst_param(num_csv2, 'num_csv2', pd.DataFrame)
        # Work on a copy so the frame received as input is not modified in
        # place (same approach as the diamond-DAG transforms).
        summed = num_csv1.copy()
        summed['sum'] = num_csv1['num1'] + num_csv2['num2']
        return summed

    two_input_solid = _dataframe_solid(
        name='two_input_solid',
        inputs=[
            InputDefinition('num_csv1', dagster_pd.DataFrame),
            InputDefinition('num_csv2', dagster_pd.DataFrame),
        ],
        transform_fn=transform,
    )

    environment = config.Environment(
        solids={
            'load_csv1': config.Solid(
                {'path': script_relative_path('num.csv')}),
            'load_csv2': config.Solid(
                {'path': script_relative_path('num.csv')}),
        })

    pipeline = PipelineDefinition(
        solids=[
            dagster_pd.load_csv_solid('load_csv1'),
            dagster_pd.load_csv_solid('load_csv2'),
            two_input_solid,
        ],
        dependencies={
            'two_input_solid': {
                'num_csv1': DependencyDefinition('load_csv1'),
                'num_csv2': DependencyDefinition('load_csv2'),
            }
        },
    )

    pipeline_result = execute_pipeline(pipeline, environment)
    assert pipeline_result.success

    df = pipeline_result.result_for_solid(
        'two_input_solid').transformed_value()

    assert isinstance(df, pd.DataFrame)
    assert df.to_dict('list') == {
        'num1': [1, 3],
        'num2': [2, 4],
        'sum': [3, 7]
    }
Пример #2
0
def test_rename_input():
    """Feed ``sum_table`` into a solid whose input is named
    ``sum_table_renamed`` and check the frame that solid produces."""
    solids = [
        dagster_pd.load_csv_solid('load_csv'),
        sum_table,
        sum_sq_table_renamed_input,
    ]
    dependencies = {
        'sum_table': {
            'num_csv': DependencyDefinition('load_csv'),
        },
        sum_sq_table_renamed_input.name: {
            'sum_table_renamed': DependencyDefinition(sum_table.name),
        },
    }
    pipeline = PipelineDefinition(solids=solids, dependencies=dependencies)

    load_only_config = get_load_only_solids_config('load_csv')
    result = execute_pipeline(
        pipeline,
        environment=get_num_csv_environment(load_only_config),
    )
    assert result.success

    renamed_result = result.result_for_solid('sum_sq_table_renamed_input')
    actual = renamed_result.transformed_value().to_dict('list')
    assert actual == {
        'num1': [1, 3],
        'num2': [2, 4],
        'sum': [3, 7],
        'sum_squared': [9, 49],
    }
Пример #3
0
def execute_transform_in_temp_csv_files(solid_inst):
    """Run ``solid_inst`` between a CSV loader and a CSV writer.

    The loader's output is wired to the first input of ``solid_inst``, and
    the output of ``solid_inst`` goes to the writer's ``df`` input. The writer
    targets a temporary file, which is read back with pandas.

    Args:
        solid_inst: Solid to execute; only its first input is connected.

    Returns:
        The DataFrame parsed from the temporary CSV file.
    """
    loader = dagster_pd.load_csv_solid('load_csv')
    writer = dagster_pd.to_csv_solid('to_csv')

    first_input_name = solid_inst.input_defs[0].name
    dependencies = {
        solid_inst.name: {
            first_input_name: DependencyDefinition('load_csv'),
        },
        'to_csv': {
            'df': DependencyDefinition(solid_inst.name),
        },
    }
    pipeline = PipelineDefinition(
        solids=[loader, solid_inst, writer],
        dependencies=dependencies,
    )

    with get_temp_file_name() as temp_file_name:
        solids_config = {
            loader.name: config.Solid(
                {'path': script_relative_path('num.csv')}),
            writer.name: config.Solid({'path': temp_file_name}),
        }
        result = execute_pipeline(
            pipeline, get_num_csv_environment(solids_config))

        assert result.success

        # Read while still inside the context manager that supplied the
        # temp file name.
        return pd.read_csv(temp_file_name)
Пример #4
0
def test_pandas_multiple_inputs():
    """Add two frames loaded from the same CSV and check the result."""
    csv_path_config = {'path': script_relative_path('num.csv')}
    environment = config.Environment(
        solids={
            'load_one': config.Solid(csv_path_config),
            'load_two': config.Solid(
                {'path': script_relative_path('num.csv')}),
        },
    )

    def transform_fn(_context, inputs):
        first, second = inputs['num_csv1'], inputs['num_csv2']
        return first + second

    input_defs = [
        InputDefinition('num_csv1', dagster_pd.DataFrame),
        InputDefinition('num_csv2', dagster_pd.DataFrame),
    ]
    double_sum = _dataframe_solid(
        name='double_sum',
        inputs=input_defs,
        transform_fn=transform_fn,
    )

    solids = [
        dagster_pd.load_csv_solid('load_one'),
        dagster_pd.load_csv_solid('load_two'),
        double_sum,
    ]
    dependencies = {
        'double_sum': {
            'num_csv1': DependencyDefinition('load_one'),
            'num_csv2': DependencyDefinition('load_two'),
        }
    }
    pipeline = PipelineDefinition(solids=solids, dependencies=dependencies)

    pipeline_result = execute_pipeline(pipeline, environment=environment)
    output_df = pipeline_result.result_for_solid(
        'double_sum').transformed_value()

    assert not output_df.empty

    expected = {
        'num1': [2, 6],
        'num2': [4, 8],
    }
    assert output_df.to_dict('list') == expected
Пример #5
0
def define_pipeline_two():
    """Build the ``pandas_hello_world_two`` pipeline.

    ``load_num_csv`` is connected to the ``num`` input of ``sum_solid``.
    """
    solids = [dagster_pd.load_csv_solid('load_num_csv'), sum_solid]
    dependencies = {
        'sum_solid': {'num': DependencyDefinition('load_num_csv')},
    }
    return PipelineDefinition(
        name='pandas_hello_world_two',
        solids=solids,
        dependencies=dependencies,
    )
Пример #6
0
def create_diamond_dag():
    """Create the five solids that make up the diamond-shaped example.

    Returns:
        A tuple ``(load_csv, num_table, sum_table, mult_table,
        sum_mult_table)`` of solid definitions. Dependencies between them
        are not set up here.
    """
    load_csv_solid = dagster_pd.load_csv_solid('load_csv')

    def passthrough_transform(_context, inputs):
        # Hands the loaded frame on unchanged.
        return inputs['num_csv']

    num_table_solid = _dataframe_solid(
        name='num_table',
        inputs=[InputDefinition('num_csv', dagster_pd.DataFrame)],
        transform_fn=passthrough_transform,
    )

    def sum_transform(_context, inputs):
        source = inputs['num_table']
        with_sum = source.copy()
        with_sum['sum'] = source['num1'] + source['num2']
        return with_sum

    sum_table_solid = _dataframe_solid(
        name='sum_table',
        inputs=[InputDefinition('num_table', dagster_pd.DataFrame)],
        transform_fn=sum_transform,
    )

    def mult_transform(_context, inputs):
        source = inputs['num_table']
        with_mult = source.copy()
        with_mult['mult'] = source['num1'] * source['num2']
        return with_mult

    mult_table_solid = _dataframe_solid(
        name='mult_table',
        inputs=[InputDefinition('num_table', dagster_pd.DataFrame)],
        transform_fn=mult_transform,
    )

    def sum_mult_transform(_context, inputs):
        sums = inputs['sum_table']
        mults = inputs['mult_table']
        combined = sums.copy()
        combined['mult'] = mults['mult']
        combined['sum_mult'] = sums['sum'] * mults['mult']
        return combined

    sum_mult_inputs = [
        InputDefinition('sum_table', dagster_pd.DataFrame),
        InputDefinition('mult_table', dagster_pd.DataFrame),
    ]
    sum_mult_table_solid = _dataframe_solid(
        name='sum_mult_table',
        inputs=sum_mult_inputs,
        transform_fn=sum_mult_transform,
    )

    return (
        load_csv_solid,
        num_table_solid,
        sum_table_solid,
        mult_table_solid,
        sum_mult_table_solid,
    )
Пример #7
0
def define_success_pipeline():
    """Build the ``pandas_hello_world`` pipeline.

    ``load_num_csv`` feeds the ``num`` input of ``sum_solid``, whose output
    feeds the ``sum_df`` input of ``sum_sq_solid``.
    """
    solids = [
        dagster_pd.load_csv_solid('load_num_csv'),
        sum_solid,
        sum_sq_solid,
    ]
    dependencies = {
        'sum_solid': {'num': DependencyDefinition('load_num_csv')},
        'sum_sq_solid': {'sum_df': DependencyDefinition(sum_solid.name)},
    }
    return PipelineDefinition(
        name='pandas_hello_world',
        solids=solids,
        dependencies=dependencies,
    )
Пример #8
0
def get_solid_transformed_value(_context, solid_inst, environment):
    """Execute ``solid_inst`` behind a CSV loader and return its value.

    Args:
        _context: Unused.
        solid_inst: Solid whose first input is connected to ``load_csv``.
        environment: Environment passed to ``execute_pipeline``.

    Returns:
        The transformed value reported for ``solid_inst``.
    """
    loader = dagster_pd.load_csv_solid('load_csv')
    dependencies = {
        solid_inst.name: {
            solid_inst.input_defs[0].name: DependencyDefinition('load_csv'),
        }
    }
    pipeline = PipelineDefinition(
        solids=[loader, solid_inst],
        dependencies=dependencies,
    )

    solid_result = execute_pipeline(
        pipeline, environment).result_for_solid(solid_inst.name)
    return solid_result.transformed_value()
Пример #9
0
def define_failure_pipeline():
    """Build the ``pandas_hello_world_fails`` pipeline.

    Chains ``load_num_csv`` -> ``sum_solid`` -> ``sum_sq_solid`` ->
    ``always_fails_solid``.
    """
    solids = [
        dagster_pd.load_csv_solid('load_num_csv'),
        sum_solid,
        sum_sq_solid,
        always_fails_solid,
    ]
    dependencies = {
        'sum_solid': {'num': DependencyDefinition('load_num_csv')},
        'sum_sq_solid': {'sum_df': DependencyDefinition(sum_solid.name)},
        'always_fails_solid': {
            'sum_sq_solid': DependencyDefinition(sum_sq_solid.name),
        },
    }
    return dagster.PipelineDefinition(
        name='pandas_hello_world_fails',
        solids=solids,
        dependencies=dependencies,
    )
Пример #10
0
def test_basic_pandas_solid():
    """Check a single-output solid that adds a ``sum`` column.

    A ``load_csv`` solid feeds the ``num_csv`` input of a solid built with
    ``single_output_transform``. The solid's output is compared against the
    expected columns.
    """
    csv_input = InputDefinition('num_csv', dagster_pd.DataFrame)

    def transform(_context, inputs):
        num_csv = inputs['num_csv']
        # Work on a copy so the frame received as input is not modified in
        # place (same approach as the diamond-DAG transforms).
        summed = num_csv.copy()
        summed['sum'] = num_csv['num1'] + num_csv['num2']
        return summed

    single_solid = single_output_transform(
        name='sum_table',
        inputs=[csv_input],
        transform_fn=transform,
        output=OutputDefinition(),
    )

    pipeline = PipelineDefinition(
        solids=[dagster_pd.load_csv_solid('load_csv'), single_solid],
        dependencies={
            single_solid.name: {
                'num_csv': DependencyDefinition('load_csv'),
            }
        })

    pipeline_result = execute_pipeline(
        pipeline,
        environment=get_num_csv_environment(
            get_load_only_solids_config('load_csv')),
    )

    assert pipeline_result.success

    assert pipeline_result.result_for_solid(
        'sum_table').transformed_value().to_dict('list') == {
            'num1': [1, 3],
            'num2': [2, 4],
            'sum': [3, 7]
        }
Пример #11
0
def run_hello_world(hello_world):
    """Exercise a ``hello_world`` solid in two pipelines.

    The first pipeline is loader -> hello_world, and the in-memory result is
    checked. The second appends a CSV writer that targets a temporary file,
    and the file contents are checked after execution.

    Args:
        hello_world: Solid with exactly one input. The dependency wiring
            refers to it by the literal names ``hello_world`` and
            ``num_csv``.
    """
    assert len(hello_world.input_defs) == 1

    expected = {
        'num1': [1, 3],
        'num2': [2, 4],
        'sum': [3, 7],
    }

    # Stage 1: loader feeding hello_world, checked in memory.
    pipeline = PipelineDefinition(
        solids=[
            dagster_pd.load_csv_solid('load_csv'),
            hello_world,
        ],
        dependencies={
            'hello_world': {
                'num_csv': DependencyDefinition('load_csv'),
            },
        },
    )

    pipeline_result = execute_pipeline(
        pipeline, environment=create_num_csv_environment())

    result = pipeline_result.result_for_solid('hello_world')
    assert result.success
    assert result.transformed_value().to_dict('list') == expected

    # Stage 2: same chain plus a CSV writer, checked through the file.
    pipeline_two = PipelineDefinition(
        solids=[
            dagster_pd.load_csv_solid('load_csv'),
            hello_world,
            dagster_pd.to_csv_solid('to_csv'),
        ],
        dependencies={
            'hello_world': {
                'num_csv': DependencyDefinition('load_csv'),
            },
            'to_csv': {
                'df': DependencyDefinition('hello_world'),
            },
        },
    )

    with get_temp_file_name() as temp_file_name:
        load_config = config.Solid({'path': script_relative_path('num.csv')})
        write_config = config.Solid({'path': temp_file_name})
        environment = config.Environment(
            solids={
                'load_csv': load_config,
                'to_csv': write_config,
            },
        )
        pipeline_result = execute_pipeline(pipeline_two, environment)

        output_result = pipeline_result.result_for_solid('hello_world')
        assert output_result.success

        written = pd.read_csv(temp_file_name)
        assert written.to_dict('list') == expected
Пример #12
0
def test_pandas_output_intermediate_csv_files():
    """Write intermediate tables to CSV, then rerun the last solid from them.

    First the diamond pipeline is run with two extra CSV writers attached to
    ``sum_table`` and ``mult_table``. Then a sub-pipeline containing
    ``sum_mult_table`` is executed with CSV loaders injected for its two
    inputs, reading the files written in the first run.
    """
    with get_temp_file_names(2) as temp_tuple:
        sum_file, mult_file = temp_tuple  # pylint: disable=E0632

        write_sum_table = dagster_pd.to_csv_solid('write_sum_table')
        write_mult_table = dagster_pd.to_csv_solid('write_mult_table')

        extra_dependencies = {
            write_sum_table.name: {
                'df': DependencyDefinition('sum_table'),
            },
            write_mult_table.name: {
                'df': DependencyDefinition('mult_table'),
            },
        }
        pipeline = create_diamond_pipeline(
            extra_solids=[write_sum_table, write_mult_table],
            extra_dependencies=extra_dependencies,
        )

        environment = get_num_csv_environment({
            'load_csv': config.Solid(
                {'path': script_relative_path('num.csv')}),
            write_sum_table.name: config.Solid({'path': sum_file}),
            write_mult_table.name: config.Solid({'path': mult_file}),
        })

        subgraph_one_result = execute_pipeline(
            pipeline, environment=environment)

        assert len(subgraph_one_result.result_list) == 5

        # The sum table must match both on disk and in memory.
        expected_sum = {
            'num1': [1, 3],
            'num2': [2, 4],
            'sum': [3, 7],
        }
        assert pd.read_csv(sum_file).to_dict('list') == expected_sum
        sum_value = subgraph_one_result.result_for_solid(
            'sum_table').transformed_value()
        assert sum_value.to_dict('list') == expected_sum

        # Same check for the mult table.
        expected_mult = {
            'num1': [1, 3],
            'num2': [2, 4],
            'mult': [2, 12],
        }
        assert pd.read_csv(mult_file).to_dict('list') == expected_mult
        mult_value = subgraph_one_result.result_for_solid(
            'mult_table').transformed_value()
        assert mult_value.to_dict('list') == expected_mult

        # Loaders injected for each input of sum_mult_table.
        injected_solids = {
            'sum_mult_table': {
                'sum_table': dagster_pd.load_csv_solid('load_sum_table'),
                'mult_table': dagster_pd.load_csv_solid('load_mult_table'),
            }
        }
        sub_pipeline = PipelineDefinition.create_sub_pipeline(
            pipeline,
            ['sum_mult_table'],
            ['sum_mult_table'],
            injected_solids,
        )
        sub_environment = config.Environment(
            solids={
                'load_sum_table': config.Solid({'path': sum_file}),
                'load_mult_table': config.Solid({'path': mult_file}),
            },
        )

        pipeline_result = execute_pipeline(
            sub_pipeline, environment=sub_environment)

        assert pipeline_result.success

        subgraph_two_result_list = pipeline_result.result_list
        assert len(subgraph_two_result_list) == 3

        output_df = pipeline_result.result_for_solid(
            'sum_mult_table').transformed_value()
        assert output_df.to_dict('list') == {
            'num1': [1, 3],
            'num2': [2, 4],
            'sum': [3, 7],
            'mult': [2, 12],
            'sum_mult': [6, 84],
        }