Python Pipeline 예제: weaverbird.pipeline.Pipeline 사용 예제 모음

예제 #1

0

파일 보기

파일: test_pipeline.py 프로젝트: pouc/weaverbird

def test_to_dict():
    """Check that ``Pipeline.dict()`` produces the expected plain dict and round-trips.

    The aggregation is built with the ``newcolumns`` / ``aggfunction`` keywords
    and is expected to come back under ``new_columns`` / ``agg_function``.
    """
    domain_step = DomainStep(name='domain', domain='foobar')
    rollup_step = RollupStep(
        name='rollup',
        hierarchy=['a', 'b'],
        aggregations=[Aggregation(newcolumns=['a'], aggfunction='sum', columns=['a'])],
    )
    pipeline = Pipeline(steps=[domain_step, rollup_step])

    serialized = pipeline.dict()

    expected_domain = {'domain': 'foobar', 'name': 'domain'}
    expected_aggregation = {
        'new_columns': ['a'],
        'agg_function': 'sum',
        'columns': ['a'],
    }
    expected_rollup = {
        'name': 'rollup',
        'hierarchy': ['a', 'b'],
        'aggregations': [expected_aggregation],
    }
    assert serialized == {'steps': [expected_domain, expected_rollup]}

    # Feeding the serialized form back to the constructor must rebuild an equal pipeline.
    assert pipeline == Pipeline(**pipeline.dict())

예제 #2

0

파일 보기

파일: test_pipeline_executor.py 프로젝트: pouc/weaverbird

def test_errors(pipeline_executor):
    """
    A failing pipeline execution must raise an error carrying useful context:
    - which step failed (its position in the pipeline and its type)
    - the message of the original exception
    """
    steps = [
        {'name': 'domain', 'domain': 'domain_a'},
        {'name': 'delete', 'columns': ['columnThatDoesNotExist', 'whatever']},
    ]

    with pytest.raises(PipelineExecutionFailure) as excinfo:
        pipeline_executor.execute_pipeline(Pipeline(steps=steps))

    failure = excinfo.value
    message = failure.message
    # The message counts steps from 1 ("Step #2") ...
    for fragment in ('Step #2', 'delete', 'columnThatDoesNotExist', 'whatever'):
        assert fragment in message
    # ... while the structured details expose the zero-based index.
    assert failure.details['index'] == 1
    assert failure.details['message'] == message

예제 #3

0

파일 보기

def test_errors(pipeline_translator, mocker):
    """
    A failing pipeline translation must raise an error carrying useful context:
    - which step failed (its position in the pipeline and its type)
    - the message of the original exception
    """
    # Force the filter step to blow up while its condition is being applied.
    mocker.patch(
        'weaverbird.backends.sql_translator.steps.filter.apply_condition',
        side_effect=Exception('comparison not implemented'),
    )
    steps = [
        {
            'name': 'filter',
            'condition': {'column': 'title', 'operator': 'eq'},
        },
    ]

    with pytest.raises(SQLPipelineTranslationFailure) as trslinfo:
        pipeline_translator(Pipeline(steps=steps))

    failure = trslinfo.value
    message = failure.message
    # The message counts steps from 1 ("Step #1") ...
    for fragment in ('Step #1', 'filter', 'comparison'):
        assert fragment in message
    # ... while the structured details expose the zero-based index.
    assert failure.details['index'] == 0
    assert failure.details['message'] == message

예제 #4

0

파일 보기

파일: test_pipeline_executor.py 프로젝트: pouc/weaverbird

def test_preview_pipeline(mocker: MockFixture, pipeline_executor):
    """Preview ``domain_a`` and check the data, schema and paging info of the JSON payload."""
    to_json_spy = mocker.spy(pd.DataFrame, 'to_json')

    raw_preview = pipeline_executor.preview_pipeline(
        Pipeline(steps=[{'name': 'domain', 'domain': 'domain_a'}])
    )
    preview = json.loads(raw_preview)

    assert 'data' in preview
    assert len(preview['data']) == 3  # rows
    assert len(preview['data'][0]) == 3  # columns

    expected_fields = [
        {'name': column, 'type': kind}
        for column, kind in (('colA', 'string'), ('colB', 'integer'), ('colC', 'integer'))
    ]
    assert preview['schema']['fields'] == expected_fields

    for key, expected in (('offset', 0), ('limit', 50), ('total', 3)):
        assert preview[key] == expected

    # The export has to go through pandas' own to_json so that NaN and dates
    # are converted correctly.
    to_json_spy.assert_called_once()

예제 #5

0

파일 보기

파일: test_pipeline_executor.py 프로젝트: pouc/weaverbird

def test_filter(pipeline_executor):
    """Filtering ``domain_a`` on ``colA == 'tutu'`` must keep only the matching row."""
    keep_tutu = {'column': 'colA', 'operator': 'eq', 'value': 'tutu'}
    steps = [
        {'name': 'domain', 'domain': 'domain_a'},
        {'name': 'filter', 'condition': keep_tutu},
    ]

    df, _ = pipeline_executor.execute_pipeline(Pipeline(steps=steps))

    expected = pd.DataFrame({'colA': ['tutu'], 'colB': [2], 'colC': [50]})
    assert_dataframes_equals(df, expected)

예제 #6

0

파일 보기

파일: test_pipeline_executor.py 프로젝트: pouc/weaverbird

def test_extract_domain(pipeline_executor: PipelineExecutor):
    """A pipeline made of a single domain step must return that domain's data as is."""
    domain_only = Pipeline(steps=[{'name': 'domain', 'domain': 'domain_a'}])

    df, _ = pipeline_executor.execute_pipeline(domain_only)

    expected = pd.DataFrame(df_domain_a)
    assert_dataframes_equals(df, expected)

예제 #7

0

파일 보기

def test_report(pipeline_translator):
    """The translation report must hold exactly one entry per pipeline step."""
    single_step_pipeline = Pipeline(steps=[{'name': 'domain', 'domain': 'domain_a'}])

    _, report = pipeline_translator(single_step_pipeline)

    # one step in the pipeline -> one step report
    assert len(report.sql_steps_translation_reports) == 1

예제 #8

0

파일 보기

파일: playground.py 프로젝트: stantoxt/weaverbird

def execute_pipeline(pipeline_steps, **kwargs) -> str:
    """Preview ``pipeline_steps`` with a ``PipelineExecutor`` backed by ``DOMAINS``.

    ``limit`` and ``offset``, when present in ``kwargs``, are converted to
    ``int``; every keyword argument is then forwarded to ``preview_pipeline``.
    """
    executor = PipelineExecutor(lambda domain: DOMAINS[domain])

    # URL parameters always arrive as strings; the paging ones must be numbers.
    for paging_key in ('limit', 'offset'):
        if paging_key in kwargs:
            kwargs[paging_key] = int(kwargs[paging_key])

    pipeline = Pipeline(steps=pipeline_steps)
    return executor.preview_pipeline(pipeline=pipeline, **kwargs)

예제 #9

0

파일 보기

def resolve_pipeline_for_combination(
    pipeline: PipelineOrDomainName,
    domain_retriever: DomainRetriever,
    pipeline_executor: PipelineExecutor,
) -> DataFrame:
    """
    Resolve the right-hand side of a combination step.

    ``pipeline`` is either a plain domain name (str), fetched through
    ``domain_retriever``, or a complete pipeline (list of steps), which is
    wrapped in a ``Pipeline`` and handed to ``pipeline_executor``.
    """
    # NOTE(review): imported locally, presumably to avoid a circular import - confirm.
    from weaverbird.pipeline import Pipeline

    if not isinstance(pipeline, str):
        return pipeline_executor(Pipeline(steps=pipeline))
    return domain_retriever(pipeline)

예제 #10

0

파일 보기

파일: test_pipeline_executor.py 프로젝트: pouc/weaverbird

def test_report(pipeline_executor):
    """The execution report must hold exactly one entry per pipeline step."""
    steps = [
        {'name': 'domain', 'domain': 'domain_a'},
        {'name': 'rename', 'toRename': [['colA', 'col_a'], ['colB', 'col_b']]},
    ]

    _, report = pipeline_executor.execute_pipeline(Pipeline(steps=steps))

    # two steps in the pipeline -> two step reports
    assert len(report.steps_reports) == 2

예제 #11

0

파일 보기

def execute_pipeline(pipeline_steps, **kwargs) -> str:
    """Validate ``pipeline_steps`` and preview the pipeline on the ``DOMAINS`` data.

    ``limit`` and ``offset``, when present in ``kwargs``, are converted to
    ``int``; every keyword argument is then forwarded to
    ``pandas_preview_pipeline``.
    """
    # Building the Pipeline model first acts as the validation of the steps.
    pipeline = Pipeline(steps=pipeline_steps)

    # URL parameters always arrive as strings; the paging ones must be numbers.
    for paging_key in ('limit', 'offset'):
        if paging_key in kwargs:
            kwargs[paging_key] = int(kwargs[paging_key])

    def retrieve_domain(domain):
        return DOMAINS[domain]

    return pandas_preview_pipeline(
        pipeline=pipeline,
        domain_retriever=retrieve_domain,
        **kwargs,
    )

예제 #12

0

파일 보기

파일: test_pipeline_executor.py 프로젝트: pouc/weaverbird

def test_preview_pipeline_limit(pipeline_executor):
    """With ``limit=1`` the preview must contain only the first row of the data frame."""
    domain_only = Pipeline(steps=[{'name': 'domain', 'domain': 'domain_a'}])

    result = pipeline_executor.preview_pipeline(domain_only, limit=1)

    first_row = {'colA': 'toto', 'colB': 1, 'colC': 100}
    assert json.loads(result)['data'] == [first_row]

예제 #13

0

파일 보기

def test_translation_pipeline(pipeline_translator, mocker):
    """A domain step followed by an ``isnull`` filter must translate to the expected SQL.

    The ``mocker`` fixture is requested but not used by this test.
    """
    steps = [
        {'name': 'domain', 'domain': 'domain_a'},
        {'name': 'filter', 'condition': {'column': 'title', 'operator': 'isnull'}},
    ]

    query_string, _ = pipeline_translator(Pipeline(steps=steps))

    expected_query = 'WITH SELECT_STEP_0 AS (SELECT title FROM books), FILTER_STEP_1 AS (SELECT * FROM SELECT_STEP_0 WHERE title IS NULL) SELECT title FROM FILTER_STEP_1'
    assert query_string == expected_query

예제 #14

0

파일 보기

파일: test_pipeline_executor.py 프로젝트: pouc/weaverbird

def test_preview_pipeline_limit_offset(pipeline_executor):
    """With ``offset=2`` and ``limit=3`` the preview must contain only the third row."""
    domain_only = Pipeline(steps=[{'name': 'domain', 'domain': 'domain_a'}])

    result = pipeline_executor.preview_pipeline(domain_only, limit=3, offset=2)

    # Third row of the data frame; there is no other row after that one.
    third_row = {'colA': 'tata', 'colB': 3, 'colC': 25}
    assert json.loads(result)['data'] == [third_row]

예제 #15

0

파일 보기

파일: test_pipeline_executor.py 프로젝트: pouc/weaverbird

def test_rename(pipeline_executor):
    """Renaming ``colA``/``colB`` must change the column names and leave the values alone."""
    steps = [
        {'name': 'domain', 'domain': 'domain_a'},
        {'name': 'rename', 'toRename': [['colA', 'col_a'], ['colB', 'col_b']]},
    ]

    df, _ = pipeline_executor.execute_pipeline(Pipeline(steps=steps))

    expected = pd.DataFrame(
        {
            'col_a': ['toto', 'tutu', 'tata'],
            'col_b': [1, 2, 3],
            'colC': [100, 50, 25],
        }
    )
    assert_dataframes_equals(df, expected)

예제 #16

0

파일 보기

def test_pandas_execute_pipeline(case_id, case_spec_file_path):
    """Run the step described by a JSON spec file through the pandas executor.

    The spec read from ``case_spec_file_path`` provides:
    - ``input``: the table exposed as domain ``'in'``
    - ``expected``: the table the pipeline must produce
    - ``step``: the step under test, executed right after the domain step
    - ``other_inputs`` (optional): additional tables, keyed by domain name

    All tables are read with pandas' ``orient='table'`` JSON format.
    ``case_id`` is not used in the body.
    """
    # Imported locally so this block does not depend on a module-level import.
    from io import StringIO

    def read_table(table_spec):
        # Recent pandas versions deprecate passing a literal JSON string to
        # read_json, so the serialized table is wrapped in a file-like object.
        return pd.read_json(StringIO(json.dumps(table_spec)), orient='table')

    # The context manager releases the file handle even if reading or parsing fails.
    with open(case_spec_file_path, 'r') as spec_file:
        spec = json.load(spec_file)

    df_in = read_table(spec['input'])
    df_out = read_table(spec['expected'])
    dfs_in_others = {
        name: read_table(table_spec)
        for name, table_spec in spec.get('other_inputs', {}).items()
    }

    pipeline = Pipeline(steps=[{'name': 'domain', 'domain': 'in'}, spec['step']])
    domains = {'in': df_in, **dfs_in_others}
    result = execute_pipeline(pipeline, domain_retriever=lambda name: domains[name])[0]

    assert_dataframes_equals(df_out, result)

예제 #17

0

파일 보기

def test_extract_query(pipeline_translator):
    """A lone domain step must translate to a single CTE followed by the final SELECT."""
    domain_only = Pipeline(steps=[{'name': 'domain', 'domain': 'domain_a'}])

    query, _ = pipeline_translator(domain_only)

    expected_query = 'WITH SELECT_STEP_0 AS (SELECT title FROM books) SELECT title FROM SELECT_STEP_0'
    assert query == expected_query

예제 #18

0

파일 보기

    context: Dict
    expected_result: List


def get_render_variables_test_cases():
    """Collect one ``Case`` per entry of every templating fixture file.

    Each ``*.json`` file under ``./tests/fixtures/fixtures_templating/`` is
    loaded as a list of entries; positions 0, 1 and 2 of an entry become the
    case's ``data``, ``context`` and ``expected_result``. The path of the
    fixture file is stored as the case's ``filename``.
    """
    collected = []
    for fixture_path in glob.glob('./tests/fixtures/fixtures_templating/*.json'):
        with open(fixture_path) as fixture:
            entries = json.load(fixture)
        collected.extend(
            Case(
                filename=fixture_path,
                data=entry[0],
                context=entry[1],
                expected_result=entry[2],
            )
            for entry in entries
        )
    return collected


# Built once at import time so the parametrization can use them.
cases = get_render_variables_test_cases()
# Lazily yields each case's fixture file name, in the same order as ``cases``.
ids = (case.filename for case in cases)


@pytest.mark.parametrize('case', cases, ids=ids)
def test_step_with_variables(case: Case):
    """Rendering a templated pipeline with the case's context must give the expected pipeline."""
    templated = PipelineWithVariables(**case.data)

    rendered = templated.render(case.context, renderer=nosql_apply_parameters_to_query)

    assert rendered == Pipeline(steps=case.expected_result)