Пример #1
0
def make_phase_1():
    """Generate the phase 1 datasets: 100 Gaussian mixtures per (d, k) pair.

    For every pair the number of points is ``80 * d * k`` and ``r`` is 0.8.
    Each dataset is saved under the directory returned by ``get_dirname``
    (created on demand via ``makedir=True``) with a random UUID as file name.
    """
    configurations = [(3, 3), (3, 5), (3, 8), (5, 3), (8, 3)]

    for dim, n_clusters in configurations:
        n_points = 80 * dim * n_clusters
        out_dir = get_dirname(dim, n_points, n_clusters, 0.8, makedir=True)
        print(out_dir)

        for _ in tqdm(range(100)):
            mixture = bmcc.GaussianMixture(
                n=n_points,
                k=n_clusters,
                d=dim,
                r=0.8,
                alpha=40,
                df=dim,
                symmetric=False,
                shuffle=False,
            )
            target = os.path.join(out_dir, str(uuid.uuid4()))
            mixture.save(target)
Пример #2
0
def run_evaluate(args):
    """Evaluate the saved samples of one (dataset, method) pair.

    Args:
        args: ``(path, method_name)`` tuple. ``path`` is the dataset file; the
            sampler output and the evaluation output locations are derived
            from it by replacing ``BASE_DIR`` with ``RESULT_DIR/<method_name>``
            and ``EVAL_DIR/<method_name>`` respectively.

    Returns:
        None. Output is written to disk: ``<eval_dir>_scores.json`` is always
        produced (it holds an ``"errored"`` entry when fewer than two samples
        were saved); otherwise the trace figure is also handed to ``save_fig``
        with the base name ``<eval_dir>_trace``.
    """

    path, method_name = args

    result_dir = path.replace(BASE_DIR, os.path.join(RESULT_DIR, method_name))
    eval_dir = path.replace(BASE_DIR, os.path.join(EVAL_DIR, method_name))
    scores_path = eval_dir + "_scores.json"

    # Ignore if already run
    if os.path.exists(scores_path):
        return

    # Load dataset
    dataset = bmcc.GaussianMixture(path, load=True)

    # np.load on an .npz archive returns a lazily-read NpzFile that keeps the
    # underlying file open; the context manager closes it as soon as the
    # array has been read instead of waiting for garbage collection.
    with np.load(result_dir) as archive:
        hist = archive['hist']

    if hist.shape[0] < 2:
        with open(scores_path, "w") as f:
            json.dump({"errored": "no iterations saved"}, f)
        return

    # If procedure terminates before 2000it (400 once thinned), use 2nd half of
    # samples
    bi_base = min(hist.shape[0] // 2, 200)

    # Base evaluation
    # We don't care about the oracle matrix for now, so skip computation
    res = bmcc.LstsqResult(dataset.data, hist, burn_in=bi_base)
    res.evaluate(
        dataset.assignments,
        oracle=dataset.oracle,
        oracle_matrix=None)

    # save trace
    save_fig(res.trace(), eval_dir + "_trace")

    # Save scores
    scores = {
        "rand": res.rand_best,
        "nmi": res.nmi_best,
        "oracle_rand": res.oracle_rand,
        "oracle_nmi": res.oracle_nmi,
        "num_clusters": int(res.num_clusters[res.best_idx]),
        "best_idx": int(res.best_idx),
        "aggregation": res.aggregation_best,
        "segregation": res.segregation_best,
        "oracle_aggregation": res.oracle_aggregation,
        "oracle_segregation": res.oracle_segregation,
        "iterations": hist.shape[0]
    }
    with open(scores_path, "w") as f:
        json.dump(scores, f)
Пример #3
0
def make_phase_2():
    """Generate the phase 2 datasets: 100 Gaussian mixtures per (d, n) pair.

    Every combination of the listed ``d`` and ``n`` values is used with
    ``k=3`` and ``r=1.0``. Each dataset is saved under the directory returned
    by ``get_dirname`` with a random UUID as file name.
    """
    dimensions = [3, 4, 5, 6, 8, 10, 12, 15, 18, 21]
    point_counts = [600, 800, 1000]

    # Same visiting order as nesting the two loops: n varies fastest.
    grid = [(dim, n_points) for dim in dimensions for n_points in point_counts]

    for dim, n_points in grid:
        out_dir = get_dirname(dim, n_points, 3, 1.0, makedir=True)
        print(out_dir)

        for _ in tqdm(range(100)):
            mixture = bmcc.GaussianMixture(
                n=n_points,
                k=3,
                d=dim,
                r=1.0,
                alpha=40,
                df=dim,
                symmetric=False,
                shuffle=False,
            )
            target = os.path.join(out_dir, str(uuid.uuid4()))
            mixture.save(target)
Пример #4
0
def run_sample(args):
    """Run MCMC sampling for one (dataset, method) pair and save the history.

    ``args`` is a ``(path, method_name)`` tuple. The output location is
    ``path`` with ``BASE_DIR`` replaced by ``RESULT_DIR/<method_name>``; when
    something already exists there the call returns without doing any work.
    Otherwise up to 5000 iterations are run and ``model.hist`` is written with
    ``np.savez`` under the key ``hist``, also when sampling stopped early.
    """
    path, method_name = args

    out_path = path.replace(BASE_DIR, os.path.join(RESULT_DIR, method_name))

    # A file at the output location means this test was already run.
    if os.path.exists(out_path):
        return

    dataset = bmcc.GaussianMixture(path, load=True)
    spec = METHODS[method_name]

    # Component model: identity scale matrix only when SCALE_MATRIX is set.
    if SCALE_MATRIX:
        scale = np.identity(dataset.d)
    else:
        scale = None
    component_model = bmcc.NormalWishart(df=dataset.d, scale=scale)

    # Mixture model is built by the method's factory from the dataset's k.
    mixture_model = spec["mixture"](dataset.k)

    # Every point starts with assignment 0.
    model = bmcc.BayesianMixture(
        data=dataset.data,
        sampler=spec["sampler"],
        component_model=component_model,
        mixture_model=mixture_model,
        assignments=np.zeros(dataset.n).astype(np.uint16),
        thinning=5,
    )

    # Sample until the iteration budget is spent or the largest assignment
    # label exceeds CLUSTERS_LIMIT. Errors are reported but not re-raised, so
    # whatever history exists at that point is still saved below.
    try:
        for _ in range(5000):
            model.iter()
            if np.max(model.assignments) > CLUSTERS_LIMIT:
                break
    except Exception as err:
        print("Exception in {} / {}:".format(method_name, path))
        print(err)

    np.savez(out_path, hist=model.hist)
Пример #5
0
def make_phase_3():
    """Generate the phase 3 datasets: 100 Gaussian mixtures per value of k.

    All datasets use ``d=3``, ``n = 50 * k`` and ``r=1.0``. The means are
    passed explicitly: the first ``k`` points of a 3-dimensional Halton
    sequence, multiplied by ``(40 * k) ** (1 / 3)``. Each dataset is saved
    under the directory returned by ``get_dirname`` with a random UUID as
    file name.
    """
    cluster_counts = [5, 10, 15, 20, 30, 40, 50, 60, 70, 80, 100]

    for n_clusters in cluster_counts:
        n_points = 50 * n_clusters
        # Factor applied to every Halton point; depends on k only.
        spread = (40 * n_clusters) ** (1 / 3)

        out_dir = get_dirname(3, n_points, n_clusters, 1.0, makedir=True)
        print(out_dir)

        for _ in tqdm(range(100)):
            # An earlier variant drew each mean as ``spread`` times a uniform
            # sample from [-0.5, 0.5) per coordinate; Halton points are used
            # instead.
            # NOTE(review): a new sequencer is built for every dataset;
            # reusing one across iterations would presumably continue the
            # sequence and change the means -- confirm before hoisting.
            halton_points = ghalton.Halton(3).get(n_clusters)
            centers = [spread * np.array(point) for point in halton_points]

            mixture = bmcc.GaussianMixture(
                n=n_points,
                k=n_clusters,
                d=3,
                r=1.0,
                df=3,
                symmetric=False,
                shuffle=False,
                means=centers,
            )
            target = os.path.join(out_dir, str(uuid.uuid4()))
            mixture.save(target)
Пример #6
0
from scipy.stats import poisson
import time
from tqdm import tqdm
import bmcc

# Run settings.
# NOTE(review): ITERATIONS, THINNING and BURN_IN are not referenced in the
# visible part of this script; presumably the sampling/evaluation code further
# down consumes them -- confirm.
ITERATIONS = 500
THINNING = 5
POINTS = 500
BURN_IN = 50

# Synthetic Gaussian mixture dataset with POINTS samples, k=3 and d=3.
dataset = bmcc.GaussianMixture(
    n=POINTS, k=3, d=3, r=0.7, alpha=40, df=3,
    symmetric=False, shuffle=False)


def hybrid(*args, **kwargs):
    """Run five ``bmcc.gibbs`` calls followed by one ``bmcc.split_merge`` call.

    All positional and keyword arguments are forwarded unchanged to every
    call. Nothing is returned.
    """
    gibbs_sweeps = 5

    for _sweep in range(gibbs_sweeps):
        bmcc.gibbs(*args, **kwargs)

    bmcc.split_merge(*args, **kwargs)


# Mixture model: bmcc.MFM with gamma=1. The prior callable returns the log-pmf
# of a Poisson distribution with mean 3 evaluated at k.
mm = bmcc.MFM(gamma=1, prior=lambda k: poisson.logpmf(k, 3))
# Alternative mixture model, kept for reference:
# mm = bmcc.DPM(alpha=1, use_eb=False)
# Component model: bmcc.NormalWishart with df=3 (the dataset above uses d=3).
cm = bmcc.NormalWishart(df=3)
Пример #7
0
import bmcc

# Generate a synthetic Gaussian mixture dataset, plot it and write it to disk.
dataset = bmcc.GaussianMixture(
    n=1000, k=3, d=3, r=0.7, alpha=40, df=3,
    symmetric=False, shuffle=False)
dataset.plot_actual(plot=True)
dataset.save("tmp.npz")

# Load the saved file back and plot again, so the two plots can be compared
# to check the save/load round trip.
dataset = bmcc.GaussianMixture("tmp.npz", load=True)
dataset.plot_actual(plot=True)