def test_find_correlation_threshold_works():
    """With ``threshold=1.0`` the result of ``find_correlation`` must be empty.

    The frame holds a ramp ``x``, a noisy copy of it ``y`` and an
    independent random column ``z``.
    """
    n_rows = 1000
    ramp = list(range(n_rows))
    # Draw order matters for reproducibility under a fixed seed:
    # the noise for "y" first, then the independent "z" column.
    jitter = np.random.randn(n_rows)
    noisy_ramp = [value + eps for value, eps in zip(ramp, jitter)]
    independent = np.random.randn(n_rows)

    frame = pd.DataFrame({"x": ramp, "y": noisy_ramp, "z": independent})

    flagged = feature_selection.find_correlation(frame, threshold=1.0)

    assert len(flagged) == 0
def test_find_correlation_large_n():
    """Run ``find_correlation`` on a 100,000-row frame with default settings.

    The frame holds a ramp ``x``, a noisy copy of it ``y`` and an
    independent random column ``z``.  Exactly one entry is expected back,
    and it must be ``"x"`` or ``"y"``, never ``"z"``.
    """
    n_rows = 100000
    x = range(n_rows)
    noise = np.random.randn(n_rows)
    y = [a + b for a, b in zip(x, noise)]
    z = np.random.randn(n_rows)
    df = pd.DataFrame(list(zip(x, y, z)), columns=["x", "y", "z"])

    out = feature_selection.find_correlation(df)

    assert len(out) == 1
    # The previous form, `out[0] == ["x"] or ["y"]`, always passed: `or`
    # falls through to the non-empty (truthy) list ["y"].  Use a real
    # membership test instead.
    # NOTE(review): assumes the result is a flat sequence of column-name
    # strings -- confirm against find_correlation's return value.
    assert out[0] in ("x", "y")
    assert out[0] != "z"
def test_find_correlation_multiple_correlated():
    """Check ``find_correlation`` when several columns track each other.

    ``xx`` and ``y`` are both the ramp ``x`` plus the *same* noise vector,
    so they hold identical values; ``z`` is independent random data.  Two
    entries are expected in the result and ``"z"`` must not be among them.
    """
    n_rows = 1000
    ramp = list(range(n_rows))
    jitter = np.random.randn(n_rows)
    noisy_ramp = [value + eps for value, eps in zip(ramp, jitter)]
    independent = np.random.randn(n_rows)

    # Key order fixes the column order: x, xx, y, z.
    frame = pd.DataFrame(
        {
            "x": ramp,
            "xx": list(noisy_ramp),
            "y": noisy_ramp,
            "z": independent,
        }
    )

    flagged = feature_selection.find_correlation(frame)

    assert len(flagged) == 2
    assert "z" not in flagged