예제 #1
0
def test_multiclass_classification_metrics_001(tc):
    """Check multiclass classification metrics on string labels.

    Builds a six-row frame of (label, prediction) colour names, calls
    ``multiclass_classification_metrics('labels', 'predictions', 1)`` and
    compares f-measure, recall, accuracy, precision and the confusion
    matrix against fixed expected values.

    :param tc: context object exposing ``to_frame``.
    """
    print("create frame")
    rows = [["red", "red"], ["blue", "green"], ["green", "green"],
            ["green", "green"], ["orange", "orange"], ["red", "orange"]]
    schema = [('labels', str), ('predictions', str)]
    frame = tc.to_frame(rows, schema)

    # Compare explicitly: ``assert (a, b, msg)`` tests a non-empty tuple and
    # therefore can never fail.
    assert frame.row_count == 6, "frame should have 6 rows"
    assert frame.column_names == ['labels', 'predictions']

    print("compute multiclass_classification_metrics()")
    cm = frame.multiclass_classification_metrics('labels', 'predictions', 1)

    # Expected values are written to 12 decimal places, so compare the float
    # metrics with a small absolute tolerance instead of exact equality.
    tolerance = 1e-9
    assert abs(cm.f_measure - 0.6) < tolerance, \
        "computed f_measure for this model should be equal to 0.6"
    assert abs(cm.recall - 0.666666666667) < tolerance, \
        "computed recall for this model should be equal to 0.666666666667"
    assert abs(cm.accuracy - 0.666666666667) < tolerance, \
        "computed accuracy for this model should be equal to 0.666666666667"
    assert abs(cm.precision - 0.638888888889) < tolerance, \
        "computed precision for this model should be equal to 0.638888888889"

    confusion_matrix = cm.confusion_matrix.values.tolist()
    expected_matrix = [[1, 0, 0], [2, 0, 0], [0, 1, 0], [0, 1, 1]]
    assert confusion_matrix == expected_matrix, \
        "computed confusion_matrix for this models should be equal to [1,0,0],[2,0,0],[0,1,0],[0,1,1]"
예제 #2
0
def test_kmeans_save_load(tc):
    """Train a k-means model, save it, load it back and compare the two.

    The reloaded model must report the same centroids and the same cluster
    sizes (for the training frame) as the model that was saved.

    :param tc: context object exposing ``to_frame`` and ``load``.
    """
    rows = [[2, "ab"], [1, "cd"], [7, "ef"],
            [1, "gh"], [9, "ij"], [2, "kl"],
            [0, "mn"], [6, "op"], [5, "qr"]]
    schema = [("data", float), ("name", str)]
    frame = tc.to_frame(rows, schema)

    trained = kmeans.train(frame, ["data"], 3, seed=5)
    assert trained.k == 3
    assert trained.columns == [u'data']
    assert trained.scalings is None

    expected_sizes = trained.compute_sizes(frame)
    assert expected_sizes == [4, 1, 4]

    # Capture the centroids before saving so the reloaded model can be
    # checked against them.
    expected_centroids = trained.centroids

    trained.save("sandbox/km1")
    reloaded = tc.load("sandbox/km1")

    assert reloaded.centroids == expected_centroids
    reloaded_sizes = reloaded.compute_sizes(frame)
    assert reloaded_sizes == expected_sizes
예제 #3
0
def test_multiclass_classification_metrics_002(tc):
    """Check multiclass classification metrics on float labels with gaps.

    Builds a six-row frame of (label, prediction) floats in which one label
    and one prediction are ``None``, calls
    ``multiclass_classification_metrics('labels', 'predictions', 1)`` and
    compares f-measure, recall, accuracy, precision and the confusion
    matrix against fixed expected values.

    NOTE(review): these expectations were never enforced before (the
    asserts were tuple expressions). The metric values follow from the
    expected confusion matrix, but the matrix itself depends on how ``None``
    cells are handled -- confirm against a real run.

    :param tc: context object exposing ``to_frame``.
    """
    print("create frame")
    rows = [[0.0, 0.0], [None, 0.0], [0.0, 0.0], [1.5, 1.5], [1.0, 1.0],
            [1.5, None]]
    schema = [('labels', float32), ('predictions', float32)]
    frame = tc.to_frame(rows, schema)

    # Compare explicitly: ``assert (a, b, msg)`` tests a non-empty tuple and
    # therefore can never fail.
    assert frame.row_count == 6, "frame should have 6 rows"
    assert frame.column_names == ['labels', 'predictions']

    print("compute multiclass_classification_metrics()")
    cm = frame.multiclass_classification_metrics('labels', 'predictions', 1)

    # Expected values are written to 12 decimal places, so compare the float
    # metrics with a small absolute tolerance instead of exact equality.
    tolerance = 1e-9
    assert abs(cm.f_measure - 0.627777777778) < tolerance, \
        "computed f_measure for this model should be equal to 0.627777777778"
    assert abs(cm.recall - 0.666666666667) < tolerance, \
        "computed recall for this model should be equal to 0.666666666667"
    assert abs(cm.accuracy - 0.666666666667) < tolerance, \
        "computed accuracy for this model should be equal to 0.666666666667"
    assert abs(cm.precision - 0.805555555556) < tolerance, \
        "computed precision for this model should be equal to 0.805555555556"

    confusion_matrix = cm.confusion_matrix.values.tolist()
    expected_matrix = [[2, 0, 0], [0, 1, 0], [1, 1, 1]]
    assert confusion_matrix == expected_matrix, \
        "computed confusion_matrix for this models should be equal to [2,0,0],[0,1,0],[1,1,1]"
예제 #4
0
def test_back_and_forth_py_scala(tc):
    """Alternate Python-lambda and binning operations on one frame.

    Adds a computed column, bins column ``a``, filters rows with ``a > 5``
    and compares the rendered ``inspect()`` output with a fixed table.

    :param tc: context object exposing ``to_frame``.
    """
    rows = [[1, "one"], [2, "two"], [3, "three"], [4, "four"], [5, "five"],
            [6, "six"], [7, "seven"], [8, "eight"], [9, "nine"], [10, "ten"]]
    schema = [("a", int32), ("b", str)]
    frame = tc.to_frame(rows, schema)  # python

    frame.add_columns(lambda row: row.a + 4, ("c", int))  # python
    frame.bin_column("a", [5, 8, 10.0, 30.0, 50, 80])     # scala
    frame.filter(lambda row: row.a > 5)                   # python

    expected = """[#]  a   b      c   a_binned
============================
[0]   6  six    10         0
[1]   7  seven  11         0
[2]   8  eight  12         1
[3]   9  nine   13         1
[4]  10  ten    14         2"""
    rendered = str(frame.inspect())
    assert rendered == expected
def test_binary_classification_metrics_001(tc):
    """Check binary classification metrics on string labels.

    Builds a four-row frame of (label, prediction) colour names, calls
    ``binary_classification_metrics('labels', 'predictions', 'green', 1)``
    and compares f-measure, recall, accuracy, precision and the confusion
    matrix against fixed expected values.

    NOTE(review): these expectations were never enforced before (the
    asserts were tuple expressions). If 'green' is the positive label, a
    hand count of the rows below gives TP=2, FN=0, FP=1, TN=1 (precision
    2/3, recall 1.0, accuracy 0.75, F1 0.8), which disagrees with the
    literals kept here from the original test. Confirm against a real run
    and correct whichever side is wrong.

    :param tc: context object exposing ``to_frame``.
    """
    print("create frame")
    rows = [["red", "red"], ["blue", "green"], ["green", "green"],
            ["green", "green"]]
    schema = [('labels', str), ('predictions', str)]
    frame = tc.to_frame(rows, schema)

    # Compare explicitly: ``assert (a, b, msg)`` tests a non-empty tuple and
    # therefore can never fail.
    assert frame.row_count == 4, "frame should have 4 rows"
    assert frame.column_names == ['labels', 'predictions']

    print("compute binary_classification_metrics()")
    cm = frame.binary_classification_metrics('labels', 'predictions', 'green',
                                             1)

    # Float metrics are compared with a small absolute tolerance rather than
    # exact equality.
    tolerance = 1e-9
    assert abs(cm.f_measure - 0.0) < tolerance, \
        "computed f_measure for this model should be equal to 0.0"
    assert abs(cm.recall - 0.0) < tolerance, \
        "computed recall for this model should be equal to 0.0"
    assert abs(cm.accuracy - 0.5) < tolerance, \
        "computed accuracy for this model should be equal to 0.5"
    assert abs(cm.precision - 0.0) < tolerance, \
        "computed precision for this model should be equal to 0.0"

    confusion_matrix = cm.confusion_matrix.values.tolist()
    expected_matrix = [[0, 2], [0, 2]]
    assert confusion_matrix == expected_matrix, \
        "computed confusion_matrix for this models should be equal to [[0, 2], [0, 2]]"
def test_binary_classification_metrics_002(tc):
    """Check binary classification metrics on float labels.

    Builds a four-row frame of (label, prediction) floats, calls
    ``binary_classification_metrics('labels', 'predictions', 1.5, 1)`` and
    compares f-measure, recall, accuracy, precision and the confusion
    matrix against fixed expected values.

    :param tc: context object exposing ``to_frame``.
    """
    print("create frame")
    rows = [[0.0, 0.0], [1.5, 0.0], [0.0, 0.0], [1.5, 1.5]]
    schema = [('labels', float32), ('predictions', float32)]
    frame = tc.to_frame(rows, schema)

    # Compare explicitly: ``assert (a, b, msg)`` tests a non-empty tuple and
    # therefore can never fail.
    assert frame.row_count == 4, "frame should have 4 rows"
    assert frame.column_names == ['labels', 'predictions']

    print("compute binary_classification_metrics()")
    cm = frame.binary_classification_metrics('labels', 'predictions', 1.5, 1)

    # Float metrics are compared with a small absolute tolerance rather than
    # exact equality.
    tolerance = 1e-9
    assert abs(cm.f_measure - 0.66666666666666663) < tolerance, \
        "computed f_measure for this model should be equal to 0.66666666666666663"
    assert abs(cm.recall - 0.5) < tolerance, \
        "computed recall for this model should be equal to 0.5"
    assert abs(cm.accuracy - 0.75) < tolerance, \
        "computed accuracy for this model should be equal to 0.75"
    assert abs(cm.precision - 1.0) < tolerance, \
        "computed precision for this model should be equal to 1.0"

    confusion_matrix = cm.confusion_matrix.values.tolist()
    expected_matrix = [[1, 1], [0, 2]]
    assert confusion_matrix == expected_matrix, \
        "computed confusion_matrix for this models should be equal to [[1, 1], [0, 2]]"
예제 #7
0
def est_np(tc):
    """Exercise a frame whose cell values are numpy integers.

    Known problem recorded by the original author: numpy numeric types do
    not make it to Scala RDDs -- the unpickler reports
    ``net.razorvine.pickle.PickleException: expected zero arguments for
    construction of ClassDict (for numpy.dtype)``.

    TODO: get this test working; once it does, go back to dtypes and enable
    the np types.

    :param tc: context object exposing ``to_frame``.
    """
    import numpy as np

    rows = [[np.int32(1), "one"], [np.int32(2), "two"]]
    # The schema deliberately declares plain int, not np.int32.
    schema = [("a", int), ("b", str)]
    frame = tc.to_frame(rows, schema)

    # Binning is used here to force the conversion to Scala.
    frame.bin_column("a", [5, 8, 10.0, 30.0, 50, 80])
예제 #8
0
def test_row_count(tc):
    """Check ``row_count`` before and after the frame switches to Scala.

    :param tc: context object exposing ``to_frame``.
    """
    rows = [[value] for value in range(0, 10)]
    frame = tc.to_frame(rows, [("a", int)])

    # Right after creation the frame reports itself as Python-backed.
    assert frame._is_python == True
    assert frame.row_count == 10

    # Reading ``_scala`` is what flips the frame over; the value is unused.
    frame._scala

    # The count must be unchanged once the frame is no longer Python-backed.
    assert frame._is_python == False
    assert frame.row_count == 10
예제 #9
0
def test_save_load(tc):
    """Save a frame, load it back and check the copy matches the original.

    The rendered ``inspect()`` output and the schema of the loaded frame
    are compared, as strings, with those of the frame that was saved.

    :param tc: context object exposing ``to_frame`` and ``load``.
    """
    path = get_sandbox_path("briton1")
    rm(path)  # presumably clears any earlier save at this path -- confirm
    frame1 = tc.to_frame([[2, "ab"], [1.0, "cd"], [7.4, "ef"], [1.0, "gh"],
                          [9.0, "ij"], [2.0, "kl"], [0, "mn"], [6.0, "op"],
                          [5.0, "qr"]],
                         [("data", float), ("name", str)])
    frame1_inspect = frame1.inspect()
    frame1.save(path)
    frame2 = tc.load(path)
    frame2_inspect = frame2.inspect()

    # Compare explicitly: ``assert (a, b)`` tests a non-empty tuple and
    # therefore can never fail. The inspect results are compared through
    # str(), the same way other tests in this file compare inspect() output.
    assert str(frame1_inspect) == str(frame2_inspect)
    assert str(frame1.schema) == str(frame2.schema)
예제 #10
0
def test_bin(tc):
    """Smoke-test ``bin_column`` on a ten-row frame; nothing is asserted.

    :param tc: context object exposing ``to_frame``.
    """
    words = ["one", "two", "three", "four", "five",
             "six", "seven", "eight", "nine", "ten"]
    # Pair each word with its number, starting at 1: [1, "one"], [2, "two"], ...
    rows = [[number, word] for number, word in enumerate(words, 1)]
    schema = [("a", int), ("b", str)]
    frame = tc.to_frame(rows, schema)

    cutoffs = [5, 8, 10.0, 30.0, 50, 80]
    frame.bin_column("a", cutoffs)
예제 #11
0
def test_kmeans(tc):
    """Train a k-means model and check sizes, WSSE, predictions and distances.

    After training on the ``data`` column with k=3 and seed=5, the test
    checks the cluster sizes and the within-set sum of squared error, then
    compares the frame's rendered ``inspect()`` output after ``predict`` and
    again after ``add_distance_columns``.

    :param tc: context object exposing ``to_frame``.
    """
    rows = [[2, "ab"], [1, "cd"], [7, "ef"],
            [1, "gh"], [9, "ij"], [2, "kl"],
            [0, "mn"], [6, "op"], [5, "qr"]]
    schema = [("data", float), ("name", str)]
    frame = tc.to_frame(rows, schema)

    trained = kmeans.train(frame, ["data"], 3, seed=5)
    assert trained.k == 3

    cluster_sizes = trained.compute_sizes(frame)
    assert cluster_sizes == [4, 1, 4]

    squared_error = trained.compute_wsse(frame)
    assert squared_error == 9.75

    # predict() is expected to add a "cluster" column to the frame.
    trained.predict(frame)
    after_predict = str(frame.inspect())
    assert after_predict == """[#]  data  name  cluster
========================
[0]   2.0  ab          0
[1]   1.0  cd          0
[2]   7.0  ef          1
[3]   1.0  gh          0
[4]   9.0  ij          1
[5]   2.0  kl          0
[6]   0.0  mn          2
[7]   6.0  op          1
[8]   5.0  qr          1"""

    # add_distance_columns() is expected to add one distance column per
    # cluster.
    trained.add_distance_columns(frame)
    after_distances = str(frame.inspect())
    assert after_distances == """[#]  data  name  cluster  distance0  distance1  distance2
=========================================================
[0]   2.0  ab          0       0.25    22.5625        4.0
[1]   1.0  cd          0       0.25    33.0625        1.0
[2]   7.0  ef          1      30.25     0.0625       49.0
[3]   1.0  gh          0       0.25    33.0625        1.0
[4]   9.0  ij          1      56.25     5.0625       81.0
[5]   2.0  kl          0       0.25    22.5625        4.0
[6]   0.0  mn          2       2.25    45.5625        0.0
[7]   6.0  op          1      20.25     0.5625       36.0
[8]   5.0  qr          1      12.25     3.0625       25.0"""
예제 #12
0
def test_smoke_take(tc):
    """Smoke-test ``take``: asking for two of three rows returns the first two.

    :param tc: context object exposing ``to_frame``.
    """
    source_rows = [[1, "one"], [2, "two"], [3, "three"]]
    frame = tc.to_frame(source_rows)
    taken = frame.take(2)
    assert taken.data == [[1, 'one'], [2, 'two']]