def test_create_client_and_cluster(entityset, monkeypatch):
    def test_cluster(n_workers=1,
                     threads_per_worker=1,
                     diagnostics_port=8787,
                     **dask_kwarg):
        return (n_workers, threads_per_worker, diagnostics_port)
    monkeypatch.setitem(create_client_and_cluster.__globals__, 'LocalCluster', test_cluster)
    monkeypatch.setitem(create_client_and_cluster.__globals__, 'Client', lambda x: x)

    # cluster in dask_kwargs case
    client, cluster = create_client_and_cluster(n_jobs=2,
                                                num_tasks=3,
                                                dask_kwargs={'cluster': 'tcp://127.0.0.1:54321'})
    assert cluster == 'tcp://127.0.0.1:54321'

    try:
        cpus = len(psutil.Process().cpu_affinity())
    except AttributeError:
        cpus = psutil.cpu_count()

    # jobs < tasks case
    client, cluster = create_client_and_cluster(n_jobs=2,
                                                num_tasks=3,
                                                dask_kwargs={})
    assert cluster == (min(cpus, 2), 1, None)
    # jobs > tasks case
    client, cluster = create_client_and_cluster(n_jobs=10,
                                                num_tasks=3,
                                                dask_kwargs={'diagnostics_port': 8789})
    assert cluster == (min(cpus, 3), 1, 8789)
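
# The assertions above pin down the behaviour under test: when dask_kwargs
# already names a 'cluster', that address is used as-is; otherwise a
# LocalCluster is started with n_workers capped by n_jobs, num_tasks, and the
# CPUs available to the process. The sketch below is an illustrative
# reconstruction of that logic under those assumptions, not the actual
# create_client_and_cluster implementation.
import psutil
from distributed import Client, LocalCluster


def create_client_and_cluster_sketch(n_jobs, num_tasks, dask_kwargs,
                                     entityset_size=0):
    # entityset_size is accepted for signature parity with the calls further
    # below; the real helper presumably uses it to size worker memory
    if 'cluster' in dask_kwargs:
        # reuse the cluster address supplied by the caller
        return Client(dask_kwargs['cluster']), dask_kwargs['cluster']
    dask_kwargs = dict(dask_kwargs)
    try:
        cpus = len(psutil.Process().cpu_affinity())
    except AttributeError:
        # cpu_affinity is unavailable on some platforms (e.g. macOS)
        cpus = psutil.cpu_count()
    cluster = LocalCluster(
        # never start more workers than requested jobs, pending tasks, or CPUs
        n_workers=min(cpus, n_jobs, num_tasks),
        threads_per_worker=1,
        # parameter name matches the (older) distributed API used in the test
        diagnostics_port=dask_kwargs.pop('diagnostics_port', None),
        **dask_kwargs)
    return Client(cluster), cluster
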
def test_create_client_and_cluster(entityset, monkeypatch):
    def test_cluster(n_workers=1,
                     threads_per_worker=1,
                     diagnostics_port=8787,
                     **dask_kwarg):
        return (n_workers, threads_per_worker, diagnostics_port)

    monkeypatch.setitem(create_client_and_cluster.__globals__, 'LocalCluster',
                        test_cluster)
    monkeypatch.setitem(create_client_and_cluster.__globals__, 'Client',
                        lambda x: x)

    # cluster in dask_kwargs case
    client, cluster = create_client_and_cluster(
        n_jobs=2,
        num_tasks=3,
        dask_kwargs={'cluster': 'tcp://127.0.0.1:54321'})
    assert cluster == 'tcp://127.0.0.1:54321'
    # jobs < tasks case
    client, cluster = create_client_and_cluster(n_jobs=2,
                                                num_tasks=3,
                                                dask_kwargs={})
    assert cluster == (2, 1, None)
    # jobs > tasks case
    client, cluster = create_client_and_cluster(
        n_jobs=10, num_tasks=3, dask_kwargs={'diagnostics_port': 8789})
    assert cluster == (3, 1, 8789)
def parallel_calculate_chunks(chunks,
                              feature_set,
                              approximate,
                              training_window,
                              verbose,
                              save_progress,
                              entityset,
                              n_jobs,
                              no_unapproximated_aggs,
                              cutoff_df_time_var,
                              target_time,
                              pass_columns,
                              dask_kwargs=None):
    from distributed import as_completed, Future
    from dask.base import tokenize

    client = None
    cluster = None
    try:
        client, cluster = create_client_and_cluster(
            n_jobs=n_jobs,
            num_tasks=len(chunks),
            dask_kwargs=dask_kwargs,
            entityset_size=entityset.__sizeof__())
        # scatter the entityset
        # denote future with leading underscore
        if verbose:
            start = time.time()
        es_token = "EntitySet-{}".format(tokenize(entityset))
        if es_token in client.list_datasets():
            if verbose:
                msg = "Using EntitySet persisted on the cluster as dataset {}"
                print(msg.format(es_token))
            _es = client.get_dataset(es_token)
        else:
            _es = client.scatter([entityset])[0]
            client.publish_dataset(**{_es.key: _es})

        # pickle the feature set with cloudpickle and scatter the bytes
        pickled_feats = cloudpickle.dumps(feature_set)
        _saved_features = client.scatter(pickled_feats)
        client.replicate([_es, _saved_features])
        # count how many workers ended up holding the scattered EntitySet
        num_scattered_workers = len(
            client.who_has([Future(es_token)]).get(es_token, []))
        num_workers = len(client.scheduler_info()['workers'].values())

        scatter_warning(num_scattered_workers, num_workers)
        if verbose:
            end = time.time()
            scatter_time = round(end - start)
            scatter_string = "EntitySet scattered to {} workers in {} seconds"
            print(scatter_string.format(num_scattered_workers, scatter_time))
        # map chunks
        # TODO: consider handling task submission dask kwargs
        _chunks = client.map(calculate_chunk,
                             chunks,
                             feature_set=_saved_features,
                             entityset=_es,
                             approximate=approximate,
                             training_window=training_window,
                             verbose=False,
                             save_progress=save_progress,
                             no_unapproximated_aggs=no_unapproximated_aggs,
                             cutoff_df_time_var=cutoff_df_time_var,
                             target_time=target_time,
                             pass_columns=pass_columns)

        feature_matrix = []
        iterator = as_completed(_chunks).batches()
        if verbose:
            pbar_str = ("Elapsed: {elapsed} | Remaining: {remaining} | "
                        "Progress: {l_bar}{bar}| "
                        "Calculated: {n}/{total} chunks")
            pbar = make_tqdm_iterator(total=len(_chunks), bar_format=pbar_str)
        for batch in iterator:
            results = client.gather(batch)
            for result in results:
                feature_matrix.append(result)
                if verbose:
                    pbar.update()
        if verbose:
            pbar.close()
    except Exception:
        raise
    finally:
        # only close a cluster we created ourselves (dask_kwargs may be None)
        if (not dask_kwargs or 'cluster' not in dask_kwargs) and cluster is not None:
            cluster.close()
        if client is not None:
            client.close()

    return feature_matrix
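
# Stand-alone sketch of the "publish once, reuse later" scatter idiom applied
# above to the EntitySet: the object is shipped to the workers a single time
# and published under a deterministic, tokenize-based name, so repeated calls
# find the published copy instead of re-serialising it. get_or_scatter is a
# hypothetical helper; list_datasets, get_dataset, scatter and publish_dataset
# are standard dask.distributed Client methods.
from dask.base import tokenize


def get_or_scatter(client, obj, prefix):
    name = "{}-{}".format(prefix, tokenize(obj))
    if name in client.list_datasets():
        # a previous call already scattered and published this object
        return client.get_dataset(name)
    future = client.scatter([obj])[0]
    # publish under the deterministic name so the next call can find it
    client.publish_dataset(**{name: future})
    return future
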
def parallel_calculate_chunks(cutoff_time,
                              chunk_size,
                              feature_set,
                              approximate,
                              training_window,
                              save_progress,
                              entityset,
                              n_jobs,
                              no_unapproximated_aggs,
                              cutoff_df_time_var,
                              target_time,
                              pass_columns,
                              progress_bar,
                              dask_kwargs=None,
                              progress_callback=None):
    from distributed import as_completed, Future
    from dask.base import tokenize

    client = None
    cluster = None
    try:
        client, cluster = create_client_and_cluster(
            n_jobs=n_jobs,
            dask_kwargs=dask_kwargs,
            entityset_size=entityset.__sizeof__())
        # scatter the entityset
        # denote future with leading underscore
        start = time.time()
        es_token = "EntitySet-{}".format(tokenize(entityset))
        if es_token in client.list_datasets():
            msg = "Using EntitySet persisted on the cluster as dataset {}"
            progress_bar.write(msg.format(es_token))
            _es = client.get_dataset(es_token)
        else:
            _es = client.scatter([entityset])[0]
            client.publish_dataset(**{_es.key: _es})

        # pickle the feature set with cloudpickle and scatter the bytes
        pickled_feats = cloudpickle.dumps(feature_set)
        _saved_features = client.scatter(pickled_feats)
        client.replicate([_es, _saved_features])
        # count how many workers ended up holding the scattered EntitySet
        num_scattered_workers = len(
            client.who_has([Future(es_token)]).get(es_token, []))
        num_workers = len(client.scheduler_info()['workers'].values())

        # group the cutoff times by time value, then re-chunk the groups to
        # the target chunk size (by default a 1 / num_workers fraction of rows)
        chunks = cutoff_time.groupby(cutoff_df_time_var)

        if not chunk_size:
            chunk_size = _handle_chunk_size(1.0 / num_workers,
                                            cutoff_time.shape[0])

        chunks = _chunk_dataframe_groups(chunks, chunk_size)

        chunks = [df for _, df in chunks]

        if len(chunks) < num_workers:
            chunk_warning = "Fewer chunks ({}), than workers ({}) consider reducing the chunk size"
            warning_string = chunk_warning.format(len(chunks), num_workers)
            progress_bar.write(warning_string)

        scatter_warning(num_scattered_workers, num_workers)
        end = time.time()
        scatter_time = round(end - start)

        # if enabled, reset timer after scatter for better time remaining estimates
        if not progress_bar.disable:
            progress_bar.reset()

        scatter_string = "EntitySet scattered to {} workers in {} seconds"
        progress_bar.write(
            scatter_string.format(num_scattered_workers, scatter_time))
        # map chunks
        # TODO: consider handling task submission dask kwargs
        _chunks = client.map(calculate_chunk,
                             chunks,
                             feature_set=_saved_features,
                             chunk_size=None,
                             entityset=_es,
                             approximate=approximate,
                             training_window=training_window,
                             save_progress=save_progress,
                             no_unapproximated_aggs=no_unapproximated_aggs,
                             cutoff_df_time_var=cutoff_df_time_var,
                             target_time=target_time,
                             pass_columns=pass_columns,
                             progress_bar=None,
                             progress_callback=progress_callback)

        feature_matrix = []
        iterator = as_completed(_chunks).batches()
        for batch in iterator:
            results = client.gather(batch)
            for result in results:
                feature_matrix.append(result)
                previous_progress = progress_bar.n
                progress_bar.update(result.shape[0])
                if progress_callback is not None:
                    update, progress_percent, time_elapsed = update_progress_callback_parameters(
                        progress_bar, previous_progress)
                    progress_callback(update, progress_percent, time_elapsed)

    except Exception:
        raise
    finally:
        # only close a cluster we created ourselves (dask_kwargs may be None)
        if (not dask_kwargs or 'cluster' not in dask_kwargs) and cluster is not None:
            cluster.close()

        if client is not None:
            client.close()

    feature_matrix = pd.concat(feature_matrix)

    return feature_matrix
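
# Compact illustration of the result-consumption pattern shared by both
# variants above: drain the mapped futures in completion order with
# as_completed(...).batches(), gather each completed batch, track progress by
# rows, and concatenate the partial frames at the end. gather_in_batches and
# its arguments are illustrative names, not featuretools API.
import pandas as pd
from distributed import as_completed
from tqdm import tqdm


def gather_in_batches(client, futures, total_rows):
    parts = []
    progress = tqdm(total=total_rows)
    for batch in as_completed(futures).batches():
        # gathering whole batches amortises scheduler round-trips
        for result in client.gather(batch):
            parts.append(result)
            progress.update(result.shape[0])
    progress.close()
    return pd.concat(parts)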