Пример #1
0
def ordering_etl_flow_tasks(*,
                            dry_run: bool = False
                            ) -> Iterator[ExecutorIterationTask]:
    """Yield ETL flow tasks prepared for the queue and execution.

    Iterates over every active notebook file name, skips notebooks that
    failed validation (logging the error), and yields one task for each
    period produced by ``Work.iter_period_for_execute()``.

    Args:
        dry_run: When true, only notebooks whose ``provider`` equals
            ``"fakedata"`` are ordered; the flag is also forwarded to
            ``ETLOperator.task``.

    Yields:
        The task built by ``ETLOperator.task`` for each
        ``(start_period, end_period)`` pair. Each task is yielded while
        the ``prepare_items_for_order`` context is still open.
    """
    # TODO: get rid of this function; rework it so that a single function
    #  is responsible for ordering.

    # NOTE(review): imported locally, presumably to avoid a circular
    # import at module load time -- confirm before moving to file level.
    from flowmaster.operators.etl.core import ETLOperator
    from flowmaster.operators.etl.policy import ETLNotebook

    for name in iter_active_notebook_filenames():
        validate, _text, _notebook_dict, notebook, error = get_notebook(name)
        notebook: ETLNotebook

        # Validation must be checked before any attribute of ``notebook``
        # is read: a notebook that failed validation is not guaranteed to
        # be a usable ETLNotebook instance.
        if not validate:
            logger.error("ValidationError: '{}': {}", name, error)
            continue

        if dry_run and notebook.provider != "fakedata":
            continue

        work = Work(notebook)
        for start_period, end_period in work.iter_period_for_execute():
            # A separate operator is created for every period.
            flow = ETLOperator(notebook)
            etl_flow_task = flow.task(start_period,
                                      end_period,
                                      dry_run=dry_run)

            with prepare_items_for_order(flow, start_period, end_period):
                logger.info("Order ETL flow [{}]: {} {}", notebook.name,
                            start_period, end_period)
                yield etl_flow_task
Пример #2
0
def test_flow(ya_metrika_logs_to_csv_notebook):
    """Run a dry run and a full task of the ETL flow with a mocked export.

    The export class of the Yandex Metrika logs provider has its
    ``__call__`` replaced by a ``Mock`` that yields two single-column
    export contexts: one holding the start period, one the end period.
    """
    from flowmaster.operators.etl.dataschema import ExportContext
    from flowmaster.operators.etl.providers import Providers
    from flowmaster.operators.etl.core import ETLOperator
    from flowmaster.operators.etl.enums import DataOrient

    def export_func(start_period, end_period,
                    **kwargs) -> Iterator[tuple[dict, list, list]]:
        # One context per period boundary, start first.
        for period in (start_period, end_period):
            yield ExportContext(
                columns=["col1"],
                data=[[period]],
                data_orient=DataOrient.values,
            )

    export_cls = Providers.YandexMetrikaLogsProvider.export_class
    export_cls.__call__ = Mock(side_effect=export_func)

    load_policy = ya_metrika_logs_to_csv_notebook.load
    load_policy.file_name = f"{test_flow.__name__}.tsv"
    load_policy.with_columns = True

    period_from = dt.datetime(2021, 1, 1)
    period_to = dt.datetime(2021, 1, 2)

    flow = ETLOperator(ya_metrika_logs_to_csv_notebook)
    flow.dry_run(start_period=period_from, end_period=period_to)

    # Exhaust the task generator so that every step is executed.
    task = flow.task(start_period=period_from, end_period=period_to)
    list(task)
def test_flow_flowmaster_items(flowmasterdata_items_to_csv_notebook):
    """Check that the loaded file mentions the notebook name exactly once."""
    notebook = flowmasterdata_items_to_csv_notebook
    day = dt.datetime(2021, 2, 5)

    etl_flow = ETLOperator(notebook)
    # Exhaust the task generator so that the flow runs to completion.
    list(etl_flow.task(day, day))

    with etl_flow.Load.open_file(mode="r") as loadfile:
        lines = loadfile.readlines()

    matching_rows = sum(1 for line in lines if notebook.name in line)

    assert matching_rows == 1
Пример #4
0
    def order_task(*args, **kwargs) -> Iterator[ExecutorIterationTask]:
        """Yield one ETL task per day of January 2021, using the "two" pool.

        Registers the pool ``"two"`` with ``pool_size`` and yields
        ``count_task`` tasks. Each task covers a single day, with start
        and end period set to the same datetime.
        """
        days = [
            dt.datetime(2021, 1, day_number)
            for day_number in range(1, count_task + 1)
        ]
        pools.append_pools({"two": pool_size})

        notebook = ya_metrika_logs_to_csv_notebook
        for day in days:
            # The notebook is reconfigured before every task is built.
            notebook.load.file_name = f"{test_pools.__name__}.tsv"
            notebook.export.pools = ["two"]

            operator = ETLOperator(notebook)
            yield operator.task(start_period=day, end_period=day)
Пример #5
0
    def order_task(*args, **kwargs) -> Iterator[ExecutorIterationTask]:
        """Yield one ETL task per day of January 2021 with concurrency limits.

        Yields ``count_task`` tasks. Before each task is built, the work
        concurrency is set to ``concurrency`` and the export, transform
        and load concurrency are each set to 4.
        """
        days = [
            dt.datetime(2021, 1, day_number)
            for day_number in range(1, count_task + 1)
        ]

        notebook = ya_metrika_logs_to_csv_notebook
        for day in days:
            # The notebook is reconfigured before every task is built.
            notebook.load.file_name = f"{test_concurrency.__name__}.tsv"
            notebook.work.concurrency = concurrency
            notebook.export.concurrency = 4
            notebook.transform.concurrency = 4
            notebook.load.concurrency = 4

            operator = ETLOperator(notebook)
            yield operator.task(start_period=day, end_period=day)