Example #1
    def test_local_cluster_adapt(self):
        adapt_kwargs = {"minimum": 1, "maximum": 1}
        called_with = None

        class MyCluster(distributed.LocalCluster):
            def adapt(self, **kwargs):
                nonlocal called_with
                called_with = kwargs
                super().adapt(**kwargs)

        executor = DaskExecutor(
            cluster_class=MyCluster,
            cluster_kwargs={
                "processes": False,
                "n_workers": 0
            },
            adapt_kwargs=adapt_kwargs,
        )

        assert executor.adapt_kwargs == adapt_kwargs

        with executor.start():
            res = executor.wait(executor.submit(lambda x: x + 1, 1))
            assert res == 2

        assert called_with == adapt_kwargs
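For context, a minimal sketch of the same adaptive setup outside of a test, assuming Prefect 1.x with dask.distributed installed (the flow and task are illustrative):

import distributed
from prefect import Flow, task
from prefect.executors import DaskExecutor

@task
def add_one(x):
    return x + 1

with Flow("adaptive-demo") as flow:
    add_one(1)

# DaskExecutor creates the temporary cluster, then calls
# cluster.adapt(**adapt_kwargs) on it before submitting work.
executor = DaskExecutor(
    cluster_class=distributed.LocalCluster,
    cluster_kwargs={"processes": False, "n_workers": 0},
    adapt_kwargs={"minimum": 1, "maximum": 4},
)
flow.run(executor=executor)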
Example #2
    def test_start_local_cluster(self, caplog):
        executor = DaskExecutor(cluster_kwargs={"processes": False})
        assert executor.cluster_class == distributed.LocalCluster
        assert executor.cluster_kwargs == {
            "processes": False,
            "silence_logs": logging.CRITICAL,
        }

        with executor.start():
            res = executor.wait(executor.submit(lambda x: x + 1, 1))
            assert res == 2

        assert any(
            "Creating a new Dask cluster" in rec.message for rec in caplog.records
        )
        try:
            import bokeh  # noqa
        except Exception:
            # If bokeh isn't installed, no dashboard will be started
            pass
        else:
            assert any(
                "The Dask dashboard is available at" in rec.message
                for rec in caplog.records
            )
Example #3
    def test_executor_disables_watch_worker_events_with_false(self):
        with distributed.Client(n_workers=1,
                                processes=False,
                                set_as_default=False) as client:
            executor = DaskExecutor(address=client.scheduler.address,
                                    watch_worker_status=False)
            with executor.start():
                assert executor.watch_worker_status is False
                assert executor._watch_dask_events_task is None
Example #4
    def test_start_local_cluster(self):
        executor = DaskExecutor(cluster_kwargs={"processes": False})
        assert executor.cluster_class == distributed.LocalCluster
        assert executor.cluster_kwargs == {
            "processes": False,
            "silence_logs": logging.CRITICAL,
        }

        with executor.start():
            res = executor.wait(executor.submit(lambda x: x + 1, 1))
            assert res == 2
Example #5
    def test_connect_to_running_cluster(self):
        with distributed.Client(processes=False, set_as_default=False) as client:
            executor = DaskExecutor(address=client.scheduler.address)
            assert executor.address == client.scheduler.address
            assert executor.cluster_class is None
            assert executor.cluster_kwargs is None
            assert executor.client_kwargs == {"set_as_default": False}

            with executor.start():
                res = executor.wait(executor.submit(lambda x: x + 1, 1))
                assert res == 2
Example #6
    def test_prep_dask_kwargs(self):
        executor = DaskExecutor()
        kwargs = executor._prep_dask_kwargs(
            dict(task_name="FISH!", task_tags=["dask-resource:GPU=1"]))
        assert kwargs["key"].startswith("FISH!-")
        assert kwargs["resources"] == {"GPU": 1.0}

        kwargs = executor._prep_dask_kwargs(
            dict(task_name="FISH!",
                 task_tags=["dask-resource:GPU=1"],
                 task_index=1))
        assert kwargs["key"].startswith("FISH!-1-")
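For these resource tags to have any effect, at least one Dask worker must advertise a matching abstract resource. A minimal sketch of the pairing, assuming Prefect 1.x and a standard dask.distributed worker (the task body is illustrative):

# Start a worker that advertises one unit of an abstract "GPU" resource:
#   dask-worker tcp://scheduler:8786 --resources "GPU=1"
from prefect import task

@task(tags=["dask-resource:GPU=1"])
def train_model():
    # DaskExecutor translates the tag into resources={"GPU": 1.0},
    # so this task is only scheduled on workers advertising "GPU".
    ...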
Example #7
    def test_executor_enables_watch_worker_events_with_true(self):
        with distributed.Client(n_workers=1,
                                processes=False,
                                set_as_default=False) as client:
            executor = DaskExecutor(
                address=client.scheduler.address,
                watch_worker_status=True,
                adapt_kwargs={"maximum": 4},
            )
            with executor.start():
                assert executor.watch_worker_status is True
                assert executor._watch_dask_events_task is not None
Example #8
    def test_performance_report(self):
        # should not error
        assert DaskExecutor().performance_report == ""
        assert (DaskExecutor(performance_report_path="not a readable path").
                performance_report == "")

        with tempfile.TemporaryDirectory() as report_dir:
            with open(f"{report_dir}/report.html", "w") as fp:
                fp.write("very advanced report")

            assert (DaskExecutor(
                performance_report_path=f"{report_dir}/report.html").
                    performance_report == "very advanced report")
Example #9
    def test_disable_cancellation_event(self, disabled):
        executor = DaskExecutor(cluster_kwargs={"processes": False},
                                disable_cancellation_event=disabled)
        with executor.start():

            # Can run futures either way
            res = executor.wait(executor.submit(lambda x: x + 1, 1))
            assert res == 2

            if disabled:
                assert executor._should_run_event is None
            else:
                assert executor._should_run_event is not None
Example #10
def _get_executor(custom_port_range, name="local"):
    # See https://github.com/equinor/ert/pull/2757#discussion_r794368854
    _, port, sock = find_available_port(custom_range=custom_port_range)
    sock.close()  # do this explicitly, not relying on GC

    if name == "local":
        cluster_kwargs = {
            "silence_logs": "debug",
        }
        return LocalDaskExecutor(**cluster_kwargs)
    elif name == "lsf":
        LSFJob._submit_job = _eq_submit_job
        cluster_kwargs = {
            "queue": "mr",
            "project": None,
            "cores": 1,
            "memory": "1GB",
            "use_stdin": True,
            "n_workers": 2,
            "silence_logs": "debug",
            "scheduler_options": {
                "port": port
            },
        }
        return DaskExecutor(
            cluster_class="dask_jobqueue.LSFCluster",
            cluster_kwargs=cluster_kwargs,
            debug=True,
        )
    elif name == "pbs":
        cluster_kwargs = {
            "n_workers": 10,
            "queue": "normal",
            "project": "ERT-TEST",
            "local_directory": "$TMPDIR",
            "cores": 1,
            "memory": "32gb",
            "resource_spec": "select=1:ncpus=1:mem=32gb",
            "scheduler_options": {
                "port": port
            },
            "extra": ["--worker-port", "51820:51840"],
        }
        return DaskExecutor(
            cluster_class="dask_jobqueue.PBSCluster",
            cluster_kwargs=cluster_kwargs,
            debug=True,
        )
    else:
        raise ValueError(f"Unknown executor name {name}")
Example #11
    def test_connect_to_running_cluster(self, caplog):
        with distributed.Client(processes=False, set_as_default=False) as client:
            address = client.scheduler.address
            executor = DaskExecutor(address=address)
            assert executor.address == address
            assert executor.cluster_class is None
            assert executor.cluster_kwargs is None
            assert executor.client_kwargs == {"set_as_default": False}

            with executor.start():
                res = executor.wait(executor.submit(lambda x: x + 1, 1))
                assert res == 2

        exp = f"Connecting to an existing Dask cluster at {address}"
        assert any(exp in rec.message for rec in caplog.records)
Example #12
    def test_executor_logs_worker_events(self, caplog):
        caplog.set_level(logging.DEBUG, logger="prefect")
        with distributed.Client(n_workers=1,
                                processes=False,
                                set_as_default=False) as client:
            executor = DaskExecutor(address=client.scheduler.address)
            with executor.start():
                client.cluster.scale(4)
                while len(client.scheduler_info()["workers"]) < 4:
                    time.sleep(0.1)
                client.cluster.scale(1)
                while len(client.scheduler_info()["workers"]) > 1:
                    time.sleep(0.1)

        assert any("Worker %s added" == rec.msg for rec in caplog.records)
        assert any("Worker %s removed" == rec.msg for rec in caplog.records)
Example #13
    def test_cluster_class_and_kwargs(self):
        pytest.importorskip("distributed.deploy.spec")
        executor = DaskExecutor(
            cluster_class="distributed.deploy.spec.SpecCluster",
            cluster_kwargs={"some_kwarg": "some_val"},
            client_kwargs={"set_as_default": True},
        )
        assert executor.cluster_class == distributed.deploy.spec.SpecCluster
        assert executor.cluster_kwargs == {"some_kwarg": "some_val"}
        assert executor.client_kwargs == {"set_as_default": True}

        class TestCluster(object):
            pass

        executor = DaskExecutor(cluster_class=TestCluster)
        assert executor.cluster_class == TestCluster
Example #14
def run_flow(
        symbols: list,
        tick_type: str,
        start_date: str,
        # end_date: str=(datetime.utcnow() - timedelta(days=1)).isoformat().split('T')[0],  # yesterday utc
        end_date: str = (dt.today() -
                         timedelta(days=1)).isoformat(),  # yesterday local tz
        n_workers: int = (cpu_count(logical=False)),
        threads_per_worker: int = 8,
        processes: bool = False):

    if not isinstance(symbols, list):
        raise ValueError('symbols expects a list type')

    flow = get_flow()

    executor = DaskExecutor(
        cluster_kwargs={
            'n_workers': n_workers,
            'processes': processes,
            'threads_per_worker': threads_per_worker,
        })
    # executor = LocalExecutor()

    flow_state = flow.run(
        executor=executor,
        symbols=symbols,
        tick_type=tick_type,
        start_date=start_date,
        end_date=end_date,
    )
    return flow_state
Example #15
    def test_temporary_cluster_forcefully_cancels_pending_tasks(self, tmpdir):
        filename = tmpdir.join("signal")

        def slow():
            time.sleep(10)
            with open(filename, "w") as f:
                f.write("Got here")

        executor = DaskExecutor()
        with executor.start():
            start = time.time()
            fut = executor.submit(slow)  # noqa
            time.sleep(0.1)
        stop = time.time()
        # Cluster shutdown before task could complete
        assert stop - start < 5
        assert not os.path.exists(filename)
Example #16
def _get_executor(name="local"):
    if name == "local":
        cluster_kwargs = {
            "silence_logs": "debug",
            "scheduler_options": {
                "port": find_open_port()
            },
        }
        return LocalDaskExecutor(**cluster_kwargs)
    elif name == "lsf":
        LSFJob._submit_job = _eq_submit_job
        cluster_kwargs = {
            "queue": "mr",
            "project": None,
            "cores": 1,
            "memory": "1GB",
            "use_stdin": True,
            "n_workers": 2,
            "silence_logs": "debug",
            "scheduler_options": {
                "port": find_open_port()
            },
        }
        return DaskExecutor(
            cluster_class="dask_jobqueue.LSFCluster",
            cluster_kwargs=cluster_kwargs,
            debug=True,
        )
    elif name == "pbs":
        cluster_kwargs = {
            "n_workers": 10,
            "queue": "normal",
            "project": "ERT-TEST",
            "local_directory": "$TMPDIR",
            "cores": 4,
            "memory": "16GB",
            "resource_spec": "select=1:ncpus=4:mem=16GB",
        }
        return DaskExecutor(
            cluster_class="dask_jobqueue.PBSCluster",
            cluster_kwargs=cluster_kwargs,
            debug=True,
        )
    else:
        raise ValueError(f"Unknown executor name {name}")
Example #17
def mproc():
    "Multi-processing executor using dask distributed"
    with Client(
            processes=True,
            scheduler_port=0,
            dashboard_address=":0",
            n_workers=2,
            threads_per_worker=1,
    ) as client:
        yield DaskExecutor(client.scheduler.address)
Example #18
def mthread():
    "Multi-threaded executor using dask distributed"
    with Client(
            processes=False,
            scheduler_port=0,
            dashboard_address=":0",
            n_workers=1,
            threads_per_worker=2,
    ) as client:
        yield DaskExecutor(client.scheduler.address)
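Examples #17 and #18 read as pytest fixtures with the @pytest.fixture decorator presumably stripped during snippet extraction; a hypothetical test consuming one might look like this (the flow itself is illustrative):

from prefect import Flow, task

@task
def inc(x):
    return x + 1

with Flow("fixture-demo") as flow:
    inc(1)

def test_flow_runs_on_threaded_dask(mthread):
    # `mthread` is the DaskExecutor yielded by the fixture above;
    # the backing Client is torn down when the fixture exits.
    state = flow.run(executor=mthread)
    assert state.is_successful()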
Example #19
    def test_executor_logs_worker_events(self, caplog):
        caplog.set_level(logging.DEBUG, logger="prefect")
        with distributed.Client(n_workers=1,
                                processes=False,
                                set_as_default=False) as client:
            executor = DaskExecutor(address=client.scheduler.address)
            with executor.start():
                assert executor.watch_worker_status is None
                assert executor._watch_dask_events_task is not None

                time.sleep(0.1)
                client.cluster.scale(4)
                while len(client.scheduler_info()["workers"]) < 4:
                    time.sleep(0.1)
                client.cluster.scale(1)
                while len(client.scheduler_info()["workers"]) > 1:
                    time.sleep(0.1)

        assert any(
            re.match("Worker .+ added", rec.msg) for rec in caplog.records)
        assert any(
            re.match("Worker .+ removed", rec.msg) for rec in caplog.records)
Example #20
    def test_exit_early_with_external_or_inproc_cluster_waits_for_pending_futures(
            self, kind, monkeypatch):
        key = "TESTING_%s" % uuid.uuid4().hex

        monkeypatch.setenv(key, "initial")

        def slow():
            time.sleep(0.5)
            os.environ[key] = "changed"

        def pending(x):
            # This function shouldn't ever start, since it's pending when the
            # shutdown signal is received
            os.environ[key] = "changed more"

        if kind == "external":
            with distributed.Client(processes=False,
                                    set_as_default=False) as client:
                executor = DaskExecutor(address=client.scheduler.address)
                with executor.start():
                    fut = executor.submit(slow)
                    fut2 = executor.submit(pending, fut)  # noqa
                    time.sleep(0.2)
                assert os.environ[key] == "changed"

        elif kind == "inproc":
            executor = DaskExecutor(cluster_kwargs={"processes": False})
            with executor.start():
                fut = executor.submit(slow)
                fut2 = executor.submit(pending, fut)  # noqa
                time.sleep(0.2)
            assert os.environ[key] == "changed"

        assert executor.client is None
        assert executor._futures is None
        assert executor._should_run_event is None
Example #21
def test_mapreduce_wordcount():
    """Distributed wordcount Flow successfully executes using Dask distributed,
    which is deployed on a Kubernetes cluster. The Flow run's state also
    contains correct word count tuples stored in the state's
    associated Result object.
    """

    url = ('https://raw.githubusercontent.com/KTH/ci-hackathon/master/'
           'installations/ci-poetry/supercollider_src/poet10/poem.txt')
    executor = DaskExecutor(address=DASK_SCHEDULER_ADDR)
    state = mapreduce_wordcount.run(url=url, executor=executor)
    task_ref = mapreduce_wordcount.get_tasks('reducer')[0]
    result = state.result[task_ref].result
    # Get top 3 tokens
    result_top_tokens = sorted(result, key=lambda x: x[1])[-3:]
    expected_top_tokens = [('a', 4), ('and', 4), ('the', 5)]
    assert state.is_successful()
    assert result_top_tokens == expected_top_tokens
Example #22
        def execute(rec):
            ex = ExecutorClass()
            pipeline = rec.to_pipelines()
            plan = ex.pipelines_to_plan(pipeline)

            if request.param == "dask":
                client = Client(dask_cluster)

            if request.param == "prefect-dask":
                from prefect.executors import DaskExecutor

                prefect_executor = DaskExecutor(
                    address=dask_cluster.scheduler_address)
                plan.run(executor=prefect_executor)
            else:
                ex.execute_plan(plan)
            if request.param == "dask":
                client.close()
                del client
Example #23
    def run(self, flow: "Flow") -> None:
        """
        Run the flow using a temporary dask-kubernetes cluster.

        Args:
            - flow (Flow): the flow to run.
        """
        # Call on_start callback if specified
        if self.on_start:
            self.on_start()

        try:
            from prefect.engine import get_default_flow_runner_class
            from prefect.executors import DaskExecutor
            from dask_kubernetes import KubeCluster

            if self._worker_spec:
                worker_pod = self._worker_spec
                worker_pod = self._populate_worker_spec_yaml(yaml_obj=worker_pod)
            else:
                with open(
                    path.join(path.dirname(__file__), "worker_pod.yaml")
                ) as pod_file:
                    worker_pod = yaml.safe_load(pod_file)
                    worker_pod = self._populate_worker_pod_yaml(yaml_obj=worker_pod)

            cluster = KubeCluster.from_dict(
                worker_pod, namespace=prefect.context.get("namespace")
            )
            cluster.adapt(minimum=self.min_workers, maximum=self.max_workers)

            executor = DaskExecutor(address=cluster.scheduler_address)
            runner_cls = get_default_flow_runner_class()
            runner_cls(flow=flow).run(executor=executor)
        except Exception as exc:
            self.logger.exception(
                "Unexpected error raised during flow run: {}".format(exc)
            )
            raise exc
        finally:
            # Call on_exit callback if specified
            if self.on_exit:
                self.on_exit()
Example #24
    def executor(self) -> Executor:

        pod_spec = make_pod_spec(
            image=self._image,
            memory_limit=self._pod_memory_limit,
            memory_request=self._pod_memory_request,
            threads_per_worker=self._pod_threads_per_worker,
            cpu_limit=self._pod_cpu_limit,
            cpu_request=self._pod_cpu_request,
            env=self._generate_env(),
        )
        pod_spec.spec.containers[0].args.extend(["--resources", "TASKSLOTS=1"])

        executor = DaskExecutor(
            cluster_class=lambda: KubeCluster(pod_spec,
                                              deploy_mode=self._deploy_mode),
            adapt_kwargs={
                "minimum": self._adapt_min,
                "maximum": self._adapt_max
            },
        )

        return executor
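Example #24 passes a zero-argument callable as cluster_class, which DaskExecutor simply calls to build the temporary cluster. The same pattern works with any cluster type; a reduced sketch substituting LocalCluster for KubeCluster (the sizing kwargs are illustrative):

from functools import partial

import distributed
from prefect.executors import DaskExecutor

# partial() gives a zero-argument callable with kwargs bound ahead of
# time, mirroring the lambda-wrapped KubeCluster above.
executor = DaskExecutor(
    cluster_class=partial(distributed.LocalCluster,
                          n_workers=2,
                          threads_per_worker=1,
                          processes=False),
    adapt_kwargs={"minimum": 1, "maximum": 2},
)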
Example #25
    def execute(  # type: ignore
        self, flow: "Flow", **kwargs: Any  # type: ignore
    ) -> None:
        """
        Execute a flow run on a dask-cloudprovider cluster.

        Args:
            - flow (Flow): the Flow object
            - **kwargs (Any): Unused
        """
        flow_run_info = None
        flow_run_id = prefect.context.get("flow_run_id")
        if self._on_execute:
            # If an on_execute Callable has been provided, retrieve the flow run parameters
            # and then allow the Callable a chance to update _provider_kwargs. This allows
            # better sizing of the cluster resources based on parameters for this Flow run.
            try:
                client = Client()
                flow_run_info = client.get_flow_run_info(flow_run_id)
                parameters = flow_run_info.parameters or {}  # type: ignore
                self._on_execute(parameters, self._provider_kwargs)
            except Exception as exc:
                self.logger.info(
                    "Failed to retrieve flow run info with error: {}".format(repr(exc))
                )
        if "image" not in self._provider_kwargs or not self._provider_kwargs.get(
            "image"
        ):
            # If image is not specified, use the Flow's image so that dependencies are
            # identical on all containers: Flow runner, Dask scheduler, and Dask workers
            flow_id = prefect.context.get("flow_id")
            try:
                client = Client()
                if not flow_id:  # We've observed cases where flow_id is None
                    if not flow_run_info:
                        flow_run_info = client.get_flow_run_info(flow_run_id)
                    flow_id = flow_run_info.flow_id
                flow_info = client.graphql(
                    """query {
                  flow(where: {id: {_eq: "%s"}}) {
                    storage
                  }
                }"""
                    % flow_id
                )
                storage_info = flow_info["data"]["flow"][0]["storage"]
                image = "{}/{}:{}".format(
                    storage_info["registry_url"],
                    storage_info["image_name"],
                    storage_info["image_tag"],
                )
                self.logger.info(
                    "Using Flow's Docker image for Dask scheduler & workers: {}".format(
                        image
                    )
                )
                self._provider_kwargs["image"] = image
            except Exception as exc:
                self.logger.info(
                    "Failed to retrieve flow info with error: {}".format(repr(exc))
                )

        self._create_dask_cluster()

        self.logger.info(
            "Executing on dynamically created Dask Cluster with scheduler address: {}".format(
                self.executor_kwargs["address"]
            )
        )
        if self.on_start:
            self.on_start()

        try:
            from prefect.engine import get_default_flow_runner_class
            from prefect.executors import DaskExecutor

            runner_cls = get_default_flow_runner_class()
            runner_cls(flow=flow).run(executor=DaskExecutor(**self.executor_kwargs))
        except Exception as exc:
            self.logger.exception(
                "Unexpected error raised during flow run: {}".format(exc)
            )
            raise
        finally:
            if self.on_exit:
                self.on_exit()
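The `_on_execute` hook at the top of this method receives the flow run's parameters together with `_provider_kwargs`, so cluster sizing can depend on the run itself. A hypothetical callable (the parameter name and sizing rule are illustrative, not from the source):

def size_cluster(parameters: dict, provider_kwargs: dict) -> None:
    # Mutate provider_kwargs in place; the agent reads it afterwards
    # when it calls _create_dask_cluster().
    n_items = len(parameters.get("symbols", []))
    provider_kwargs["n_workers"] = max(1, min(n_items, 16))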
Example #26
def run_sigla_pipeline(master_spreadsheet_id: str,
                       google_api_credentials_path: str,
                       db_connection_url: str):
    """
    Run the SIGLA ETL pipeline

    Parameters
    ----------
    master_spreadsheet_id: str
        The master spreadsheet id.
    google_api_credentials_path: str
        The path to Google API credentials file needed to read Google Sheets.
    db_connection_url: str
        The DB's connection url str.
    """
    log.info("Finished pipeline set up, start running pipeline")
    log.info("=" * 80)
    # Spawn local dask cluster
    cluster = LocalCluster()
    # Log the dashboard link
    log.info(f"Dashboard available at: {cluster.dashboard_link}")
    # Setup workflow
    with Flow("SIGLA Data Pipeline") as flow:
        # Delete all documents from db
        clean_up_task = _clean_up(db_connection_url)
        # Get spreadsheet ids
        spreadsheet_ids = _get_spreadsheet_ids(master_spreadsheet_id,
                                               google_api_credentials_path)
        # Extract sheets data.
        # Get back list of list of SheetData
        spreadsheets_data = _extract.map(
            spreadsheet_ids,
            unmapped(google_api_credentials_path),
            upstream_tasks=[unmapped(clean_up_task)],
        )

        # Transform list of SheetData into FormattedSheetData
        formatted_spreadsheets_data = _transform.map(
            flatten(spreadsheets_data))
        # Create institution filter
        gs_institution_filter = _create_filter_task([
            gs_format.standard_institution,
            gs_format.multiple_sigla_answer_variable,
        ])
        # Filter to list of institutional formatted sheet data
        gs_institutions_data = gs_institution_filter(
            formatted_spreadsheets_data)
        # Create composite filter
        gs_composite_filter = _create_filter_task([
            gs_format.composite_variable,
            gs_format.institution_and_composite_variable,
        ])
        # Filter to list of composite formatted sheet data
        gs_composites_data = gs_composite_filter(formatted_spreadsheets_data)

        # Load institutional data
        load_institutions_data_task = _load_institutions_data.map(
            gs_institutions_data, unmapped(db_connection_url))
        # Load composite data
        load_composites_data_task = _load_composites_data.map(
            gs_composites_data,
            unmapped(db_connection_url),
            upstream_tasks=[unmapped(load_institutions_data_task)],
        )
        # Log spreadsheets that were loaded
        _log_spreadsheets(spreadsheets_data,
                          upstream_tasks=[load_composites_data_task])

    # Run the flow
    state = flow.run(executor=DaskExecutor(cluster.scheduler_address))
    if state.is_failed():
        raise PrefectFlowFailure(ErrorInfo({"flow_name": flow.name}))
Example #27
def run_external_link_checker(
    google_api_credentials_path: str,
    master_spreadsheet_id: Optional[str] = None,
    spreadsheet_ids_str: Optional[str] = None,
):
    """
    Run the external link checker.
    If a list of spreadsheet ids is provided, run the external link checker
    against that list instead of the spreadsheet ids gathered from the
    master spreadsheet.

    Parameters
    ----------
    google_api_credentials_path: str
        The path to the Google API credentials file needed to read Google Sheets.
    master_spreadsheet_id: Optional[str]
        The master spreadsheet id.
    spreadsheet_ids_str: Optional[str]
        The list of spreadsheet ids, delimited by commas.
    """
    log.info("Finished external link checker set up, start checking external link.")
    log.info("=" * 80)
    # Spawn local dask cluster
    cluster = LocalCluster()
    # Log the dashboard link
    log.info(f"Dashboard available at: {cluster.dashboard_link}")
    # Setup workflow
    with Flow("Check external links") as flow:
        # Get spreadsheet ids
        spreadsheet_ids = _get_spreadsheet_ids(
            master_spreadsheet_id, google_api_credentials_path, spreadsheet_ids_str
        )

        # Extract sheets data.
        # Get back list of list of SheetData
        spreadsheets_data = _extract.map(
            spreadsheet_ids,
            unmapped(google_api_credentials_path),
        )
        # Extract links from list of SheetData
        # Get back list of list of URLData
        links_data = _extract_external_links.map(flatten(spreadsheets_data))
        # Unique the url data
        unique_links_data = _unique_external_links(flatten(links_data))
        # Check external links
        _check_external_link.map(unique_links_data)

    # Run the flow
    state = flow.run(executor=DaskExecutor(cluster.scheduler_address))
    if state.is_failed():
        raise PrefectFlowFailure(ErrorInfo({"flow_name": flow.name}))
    # Get the list of CheckedURL
    checked_links = state.result[flow.get_tasks(name="_check_external_link")[0]].result
    log.info("=" * 80)
    # Get error links
    error_links = [link for link in checked_links if link.has_error]
    gs_cells = []
    for error_link in error_links:
        for cell in error_link.url_data.cells:
            gs_cells.append(
                GoogleSheetCell(
                    spreadsheet_title=cell.spreadsheet_title,
                    sheet_title=cell.sheet_title,
                    row_index=cell.row_index,
                    col_index=cell.col_index,
                    url=error_link.url_data.url,
                    msg=error_link.msg,
                )
            )

    sorted_gs_cells = sorted(
        gs_cells,
        key=lambda x: (
            x.spreadsheet_title,
            x.sheet_title,
            x.row_index,
            x.col_index,
            x.url,
        ),
    )
    # Write error links to a csv file
    with open("external_links.csv", mode="w") as csv_file:
        fieldnames = ["spreadsheet_title", "sheet_title", "cell", "url", "reason"]
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames, delimiter="\t")
        writer.writeheader()
        for gs_cell in sorted_gs_cells:
            writer.writerow(
                {
                    "spreadsheet_title": gs_cell.spreadsheet_title,
                    "sheet_title": gs_cell.sheet_title,
                    "cell": convert_rowcol_to_A1_name(
                        gs_cell.row_index, gs_cell.col_index
                    ),
                    "url": gs_cell.url,
                    "reason": f"{gs_cell.msg}",
                }
            )
    log.info("Finished writing external links csv file")
Example #28
    def test_cant_specify_both_address_and_cluster_class(self):
        with pytest.raises(ValueError):
            DaskExecutor(
                address="localhost:8787",
                cluster_class=distributed.LocalCluster,
            )
Example #29
    def test_debug_is_converted_to_silence_logs(self, debug):
        executor = DaskExecutor(debug=debug)
        level = logging.WARNING if debug else logging.CRITICAL
        assert executor.cluster_kwargs["silence_logs"] == level