Exemplo n.º 1
0
def predict(dataset, fitmodel_url, save_results=True, show=False):
    """
    Predict the closest clusters for the datapoints in input.

    Parameters:
        dataset: object whose ``params`` dict carries ``input_chain`` and
            ``data_tag`` entries describing the input data.
        fitmodel_url: mapping with a ``"kmeans_fitmodel"`` key pointing at
            the results of a previous k-means fit.
        save_results: whether the Disco worker persists job results.
        show: passed through to ``job.wait`` to control progress output.

    Returns:
        The result URL(s) from ``job.wait``.

    Raises:
        Exception: if ``fitmodel_url`` carries no k-means fit model.
    """

    from disco.job import Job
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import result_iterator

    if "kmeans_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    job = Job(worker=Worker(save_results=save_results))
    # dict(a.items() + b.items()) is Python-2-only: dict views cannot be
    # concatenated with + in Python 3.  Merge via update() instead, which
    # works on both versions.
    job.params = dict(dataset.params.items())
    job.params.update(mean_point_center.items())
    # result_iterator already yields (index, center) pairs; no need for an
    # identity comprehension.
    job.params["centers"] = list(
        result_iterator(fitmodel_url["kmeans_fitmodel"])
    )

    job.pipeline = [("split",
                     Stage("kmeans_predict",
                           input_chain=dataset.params["input_chain"],
                           init=simple_init,
                           process=predict_map))]

    job.run(input=dataset.params["data_tag"], name="kmeans_predict")

    return job.wait(show=show)
Exemplo n.º 2
0
def predict(dataset, fitmodel_url, save_results=True, show=False):
    """
    Predict the closest clusters for the datapoints in input.

    Parameters:
        dataset: object whose ``params`` dict carries ``input_chain`` and
            ``data_tag`` entries describing the input data.
        fitmodel_url: mapping with a ``"kmeans_fitmodel"`` key pointing at
            the results of a previous k-means fit.
        save_results: whether the Disco worker persists job results.
        show: passed through to ``job.wait`` to control progress output.

    Returns:
        The result URL(s) from ``job.wait``.

    Raises:
        Exception: if ``fitmodel_url`` carries no k-means fit model.
    """

    from disco.job import Job
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import result_iterator

    if "kmeans_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    job = Job(worker=Worker(save_results=save_results))
    # dict(a.items() + b.items()) is Python-2-only: dict views cannot be
    # concatenated with + in Python 3.  Merge via update() instead.
    job.params = dict(dataset.params.items())
    job.params.update(mean_point_center.items())
    # result_iterator already yields (index, center) pairs.
    job.params["centers"] = list(result_iterator(fitmodel_url["kmeans_fitmodel"]))

    job.pipeline = [
        (
            "split",
            Stage("kmeans_predict", input_chain=dataset.params["input_chain"], init=simple_init, process=predict_map),
        )
    ]

    job.run(input=dataset.params["data_tag"], name="kmeans_predict")

    return job.wait(show=show)
Exemplo n.º 3
0
def fit(dataset,
        n_clusters=5,
        max_iterations=10,
        random_state=None,
        save_results=True,
        show=False):
    """
    Optimize k-clustering for `max_iterations` iterations with cluster
    center definitions as given in `center`.

    Parameters:
        dataset: object whose ``params`` dict carries ``input_chain`` and
            ``data_tag`` entries describing the input data.
        n_clusters: number of clusters k (must be >= 2).
        max_iterations: number of Lloyd iterations to run (must be >= 1).
        random_state: seed forwarded to the init-map stage as ``"seed"``.
        save_results: whether the Disco workers persist job results.
        show: passed through to ``job.wait`` to control progress output.

    Returns:
        dict with key ``"kmeans_fitmodel"`` mapping to the result URL of
        the final iteration.

    Raises:
        Exception: if the parameters are non-numeric or out of range.
    """
    from disco.job import Job
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import result_iterator

    # Keep the try body minimal: only the conversions can raise ValueError.
    try:
        n_clusters = int(n_clusters)
        max_iterations = int(max_iterations)
    except ValueError:
        raise Exception("Parameters should be numerical.")
    if n_clusters < 2:
        raise Exception("Parameter n_clusters should be greater than 1.")
    if max_iterations < 1:
        raise Exception(
            "Parameter max_iterations should be greater than 0.")

    # Initialization job: pick k random centers and reduce them per label.
    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [("split",
                     Stage("kmeans_init_map",
                           input_chain=dataset.params["input_chain"],
                           init=map_init,
                           process=random_init_map)),
                    ('group_label',
                     Stage("kmeans_init_reduce",
                           process=estimate_reduce,
                           init=simple_init,
                           combine=True))]
    # dict(a.items() + b.items()) is Python-2-only: dict views cannot be
    # concatenated with + in Python 3.  Merge via update() instead.
    job.params = dict(dataset.params.items())
    job.params.update(mean_point_center.items())
    job.params['seed'] = random_state
    job.params['k'] = n_clusters

    job.run(input=dataset.params["data_tag"], name="kmeans_init")
    init = job.wait(show=show)
    # result_iterator already yields (index, center) pairs.
    centers = list(result_iterator(init))

    # Lloyd iterations: assign points to the nearest center (map), then
    # re-estimate centers (reduce); feed the new centers into the next round.
    for j in range(max_iterations):
        job = Job(worker=Worker(save_results=save_results))
        job.params = dict(dataset.params.items())
        job.params.update(mean_point_center.items())
        job.params['k'] = n_clusters
        job.params['centers'] = centers

        job.pipeline = [('split',
                         Stage("kmeans_map_iter_%s" % (j + 1, ),
                               input_chain=dataset.params["input_chain"],
                               process=estimate_map,
                               init=simple_init)),
                        ('group_label',
                         Stage("kmeans_reduce_iter_%s" % (j + 1, ),
                               process=estimate_reduce,
                               init=simple_init,
                               combine=True))]

        job.run(input=dataset.params["data_tag"],
                name='kmeans_iter_%d' % (j + 1, ))
        fitmodel_url = job.wait(show=show)
        centers = list(result_iterator(fitmodel_url))

    return {"kmeans_fitmodel": fitmodel_url}  # return results url
Exemplo n.º 4
0
def fit(dataset, n_clusters=5, max_iterations=10, random_state=None, save_results=True, show=False):
    """
    Optimize k-clustering for `max_iterations` iterations with cluster
    center definitions as given in `center`.

    Parameters:
        dataset: object whose ``params`` dict carries ``input_chain`` and
            ``data_tag`` entries describing the input data.
        n_clusters: number of clusters k (must be >= 2).
        max_iterations: number of Lloyd iterations to run (must be >= 1).
        random_state: seed forwarded to the init-map stage as ``"seed"``.
        save_results: whether the Disco workers persist job results.
        show: passed through to ``job.wait`` to control progress output.

    Returns:
        dict with key ``"kmeans_fitmodel"`` mapping to the result URL of
        the final iteration.

    Raises:
        Exception: if the parameters are non-numeric or out of range.
    """
    from disco.job import Job
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import result_iterator

    # Keep the try body minimal: only the conversions can raise ValueError.
    try:
        n_clusters = int(n_clusters)
        max_iterations = int(max_iterations)
    except ValueError:
        raise Exception("Parameters should be numerical.")
    if n_clusters < 2:
        raise Exception("Parameter n_clusters should be greater than 1.")
    if max_iterations < 1:
        raise Exception("Parameter max_iterations should be greater than 0.")

    # Initialization job: pick k random centers and reduce them per label.
    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [
        (
            "split",
            Stage("kmeans_init_map", input_chain=dataset.params["input_chain"], init=map_init, process=random_init_map),
        ),
        ("group_label", Stage("kmeans_init_reduce", process=estimate_reduce, init=simple_init, combine=True)),
    ]
    # dict(a.items() + b.items()) is Python-2-only: dict views cannot be
    # concatenated with + in Python 3.  Merge via update() instead.
    job.params = dict(dataset.params.items())
    job.params.update(mean_point_center.items())
    job.params["seed"] = random_state
    job.params["k"] = n_clusters

    job.run(input=dataset.params["data_tag"], name="kmeans_init")
    init = job.wait(show=show)
    # result_iterator already yields (index, center) pairs.
    centers = list(result_iterator(init))

    # Lloyd iterations: assign points to the nearest center (map), then
    # re-estimate centers (reduce); feed the new centers into the next round.
    for j in range(max_iterations):
        job = Job(worker=Worker(save_results=save_results))
        job.params = dict(dataset.params.items())
        job.params.update(mean_point_center.items())
        job.params["k"] = n_clusters
        job.params["centers"] = centers

        job.pipeline = [
            (
                "split",
                Stage(
                    "kmeans_map_iter_%s" % (j + 1,),
                    input_chain=dataset.params["input_chain"],
                    process=estimate_map,
                    init=simple_init,
                ),
            ),
            (
                "group_label",
                Stage("kmeans_reduce_iter_%s" % (j + 1,), process=estimate_reduce, init=simple_init, combine=True),
            ),
        ]

        job.run(input=dataset.params["data_tag"], name="kmeans_iter_%d" % (j + 1,))
        fitmodel_url = job.wait(show=show)
        centers = list(result_iterator(fitmodel_url))

    return {"kmeans_fitmodel": fitmodel_url}  # return results url