# --- Example 1 ---
def test_dask_vs_numpy():
    """GAM output from a dask DataFrame must match output from a pandas DataFrame.

    Reads the same CSV through both backends, generates explanations with
    each, and compares the resulting attributions and feature labels.
    """
    client = Client()
    try:
        ddf = dd.read_csv("tests/test_attributes.csv")
        ddf = ddf.repartition(npartitions=4)
        g_ddf = gam.GAM(attributions=ddf)
        g_ddf.generate()

        df = pd.read_csv("tests/test_attributes.csv")
        g_df = gam.GAM(attributions=df)
        g_df.generate()

        # BUG FIX: `a.all() == b.all()` reduces each array to a single truth
        # value and compares those — it never compares contents elementwise.
        assert (g_ddf.attributions == g_df.attributions).all()
        assert g_ddf.feature_labels == g_df.feature_labels
    finally:
        # Release the dask client even when an assertion above fails.
        client.close()
def test_find_optimal_clusters():
    """Cluster-count search via silhouette score on 4 well-separated blobs.

    Generates 40 tightly-clustered 2-D points, writes them to a CSV, then
    checks both the per-k silhouette scores and the chosen k.
    """
    # Create sample attributions with 4 clusters.
    # (Original docstring had a stray fourth quote: `""""Create ...`.)
    attributions_file = "test_opt.csv"
    X, y = make_blobs(
        n_samples=40,
        n_features=2,
        centers=4,
        cluster_std=0.01,
        center_box=(0.0, 1.0),
        shuffle=True,
        random_state=42,
    )

    logging.info(f"blobs made - {X.shape},{y.shape}")
    df = pd.DataFrame(columns=["x1", "x2"], data=X)
    df.to_csv(attributions_file)

    # Check cluster search via silhouette score.
    g = gam.GAM(attributions_path=attributions_file, distance="kendall_tau")
    g.get_optimal_clustering(max_clusters=6, verbose=True)

    logging.info(f"attributions file - {g.attributions_path}")
    logging.info(f"what we settled on - {g.k}")

    # Compare silhouette scores with a float tolerance rather than exact
    # equality, which is brittle across platforms and library versions.
    expected = [
        (-0.6997008553383604, 4),
        (-0.49646173501281243, 3),
        (-0.20264778322315316, 6),
        (-0.15419026951164008, 5),
        (0.3760497014154821, 2),
    ]
    assert len(g.silh_scores) == len(expected)
    for (score, k), (exp_score, exp_k) in zip(g.silh_scores, expected):
        assert score == pytest.approx(exp_score)
        assert k == exp_k
    assert g.k == 2
# --- Example 3 ---
def test_read_csv():
    """Loading a local CSV populates both attributions and feature labels."""
    g = gam.GAM(attributions_path="tests/test_attributes.csv")
    g._read_local()

    # The fixture file holds 4 samples across 3 attribution columns.
    assert hasattr(g, "attributions")
    assert g.attributions.shape == (4, 3)

    # Column headers become the feature labels, in file order.
    assert hasattr(g, "feature_labels")
    assert g.feature_labels == ["a1", "a2", "a3"]
# --- Example 4 ---
def test_read_csv_local():
    """Loading a local CSV populates attributions and feature labels.

    Renamed from ``test_read_csv``: a second ``def`` with the same name in
    one module shadows the earlier one, so pytest silently collects and runs
    only the last definition.
    """
    g = gam.GAM(attributions_path="tests/test_attributes.csv")
    g._read_local()

    # Fixture file: 4 rows x 3 attribution columns.
    assert hasattr(g, "attributions")
    assert g.attributions.shape == (4, 3)

    # Column headers become the feature labels.
    assert hasattr(g, "feature_labels")
    assert g.feature_labels == ["a1", "a2", "a3"]
# --- Example 5 ---
def test_normalize():
    """Tests normalization of attributions from csv."""
    g = gam.GAM(attributions_path="tests/test_attributes.csv")
    g._read_local()

    normalized_attributions = gam.GAM.normalize(g.attributions)

    # Normalization must not change the matrix shape.
    assert normalized_attributions.shape == g.attributions.shape
    # BUG FIX: `np.any(np.where(cond))` inspects the *indices* where cond
    # holds, not cond itself — a negative value at row/col 0 produces index 0,
    # which is falsy, so the old assertion could pass on bad data.
    assert not np.any(normalized_attributions < 0)
    # Each row of a normalized matrix sums to 1.
    assert normalized_attributions.sum(axis=1)[0] == pytest.approx(1.0)
# --- Example 6 ---
def test_plotting_2attributes():
    """Plotting two explanations writes at least one image file to disk."""
    explanations = [
        [("height", 0.05), ("weight", 0.05), ("hair color", 0.9)],
        [("height", 0.9), ("weight", 0.05), ("hair color", 0.05)],
    ]

    g = gam.GAM(
        attributions_path="tests/test_attributes.csv",
        k=len(explanations),
    )
    g.explanations = explanations

    fname = "tests/image3"
    g.plot(num_features=2, output_path_base=fname, display=False)

    # At least one file whose name starts with the base path must exist.
    produced = glob.glob(fname + "*")
    assert len(produced) > 0
    # Clean up so the test leaves no artifacts behind.
    for path in produced:
        os.remove(path)
# --- Example 7 ---
def test_plotting_two_attributes():
    """Plotting two explanations writes at least one image file to disk.

    Renamed from ``test_plotting_2attributes``: a second ``def`` with the
    same name shadows the earlier one, so pytest collects only the last
    definition and silently drops the other.
    """
    explanations = [
        [("height", 0.05), ("weight", 0.05), ("hair color", 0.9)],
        [("height", 0.9), ("weight", 0.05), ("hair color", 0.05)],
    ]

    g = gam.GAM(attributions_path="tests/test_attributes.csv", k=len(explanations))

    g.explanations = explanations

    fname = "tests/image3"
    g.plot(num_features=2, output_path_base=fname, display=False)

    # At least one file whose name starts with the base path must exist.
    output = glob.glob(fname + "*")
    assert len(output) > 0
    # Clean up generated images so the test leaves no artifacts.
    for ofile in output:
        os.remove(ofile)
# --- Example 8 ---
def test_cluster():
    """Tests subpopulations generated by clustering attributions."""
    g = gam.GAM(attributions_path="tests/test_attributes.csv")
    g._read_local()
    g.clustering_attributions = gam.GAM.normalize(g.attributions)
    g._cluster()

    # Exactly two clusters, each non-empty.
    # (The original asserted len(g.explanations) == 2 twice; once suffices.)
    assert len(g.explanations) == 2
    assert g.subpopulation_sizes[0] > 0
    assert g.subpopulation_sizes[1] > 0
    # Top-ranked feature of the first explanation matches the first label.
    assert g.explanations[0][0][0] == g.feature_labels[0]

    # Each explanation's weights are normalized to sum to 1.
    for explanation in g.explanations:
        explanation_sum = sum(weight for _label, weight in explanation)
        assert explanation_sum == pytest.approx(1)
def test_find_optimal_clusters_default():
    """Optimal-k search with default parameters recovers the 4 generated blobs.

    Renamed from ``test_find_optimal_clusters``: this second same-named
    ``def`` shadowed the earlier definition, so pytest collected only one
    of the two tests.  Also fixes the malformed ``\"\"\"\"`` docstrings and
    the ``assert(...)`` call-style.
    """
    # Create sample attributions with 4 tight, well-separated clusters.
    attributions_file = "test_opt.csv"
    X, y = make_blobs(
        n_samples=40,
        n_features=2,
        centers=4,
        cluster_std=0.01,
        center_box=(0.0, 1.0),
        shuffle=True,
        random_state=42,
    )

    print("blobs made - ", X.shape, y.shape)
    df = pd.DataFrame(columns=["x1", "x2"], data=X)
    df.to_csv(attributions_file)

    # Check cluster search via silhouette score.
    g = gam.GAM(attributions_path=attributions_file, distance="kendall_tau")
    g.get_optimal_clustering()

    print("attributions file - ", g.attributions_path)
    print("data size = ", g.normalized_attributions.shape)
    print("what we settled on - ", g.k)
    assert g.k == 4
# --- Example 10 ---
def test_read_df_or_list():
    """GAM accepts dask DataFrames/arrays, pandas DataFrames, lists, and
    numpy arrays, producing identical attributions and labels for each."""

    def verify(g):
        # Shared post-generate checks: matrix shape and feature labels.
        assert hasattr(g, "attributions")
        assert g.attributions.shape == (4, 3)
        assert hasattr(g, "feature_labels")
        assert g.feature_labels == ["a1", "a2", "a3"]

    # preprocessing: build every input flavor from the same CSV
    df = pd.read_csv("tests/test_attributes.csv")
    att_list = df.values.tolist()
    feat_labels_list = df.columns.tolist()

    att_arr = np.asarray(att_list)
    feat_labels_arr = np.asarray(feat_labels_list)

    client = Client()
    ddf = dd.read_csv("tests/test_attributes.csv")
    ddf = ddf.repartition(npartitions=4)
    dask_att_arr = da.from_array(att_list)
    dask_feat_labels_arr = da.from_array(feat_labels_list)

    # Testing dask DataFrame
    g_ddf = gam.GAM(attributions=ddf)
    g_ddf.generate()
    verify(g_ddf)

    # Testing dask array
    g_dask_list = gam.GAM(
        attributions=dask_att_arr,
        batchsize=100,
        feature_labels=dask_feat_labels_arr,
    )
    g_dask_list.generate()
    verify(g_dask_list)
    client.close()

    # Testing DataFrame
    g_df = gam.GAM(attributions=df)
    g_df.generate()
    verify(g_df)

    # Testing lists
    g_list = gam.GAM(
        attributions=att_list,
        batchsize=100,
        feature_labels=feat_labels_list,
    )
    g_list.generate()
    verify(g_list)

    # Testing numpy arrays
    g_arr = gam.GAM(
        attributions=att_arr,
        batchsize=100,
        feature_labels=feat_labels_arr,
    )
    g_arr.generate()
    verify(g_arr)

    # Testing failure: attributions without feature labels must raise
    with pytest.raises(ValueError):
        g_fail = gam.GAM(attributions=att_arr, batchsize=100)
        g_fail.generate()