Exemplo n.º 1
0
def test_initialize_models():
    """Check that initialize_models stores the requested number of fresh models.

    Gets a table name from create_dha for 'data/dha_missing.csv', initializes
    ``num_models`` models on it and verifies, through the engine's persistence
    layer, that the stored model ids are exactly 0..num_models-1 and that every
    model reports zero iterations.
    """
    test_tablename, _ = create_dha(path='data/dha_missing.csv')

    engine = Engine(seed=0)
    num_models = 5
    engine.initialize_models(test_tablename, num_models)

    model_ids = engine.persistence_layer.get_model_ids(test_tablename)
    # Compare against a real list: on Python 3 ``range`` is a lazy object that
    # never compares equal to a list, so the bare ``range`` form would fail no
    # matter which ids were returned. On Python 2 this is unchanged.
    assert sorted(model_ids) == list(range(num_models))
    for i in range(num_models):
        model = engine.persistence_layer.get_models(test_tablename, i)
        assert model['iterations'] == 0
Exemplo n.º 2
0
def test_initialize_models():
    """Check that initialize_models stores the requested number of fresh models.

    Gets a table name from create_dha for 'data/dha_missing.csv', initializes
    ``num_models`` models on it and verifies, through the engine's persistence
    layer, that the stored model ids are exactly 0..num_models-1 and that every
    model reports zero iterations.
    """
    test_tablename, _ = create_dha(path='data/dha_missing.csv')

    engine = Engine(seed=0)
    num_models = 5
    engine.initialize_models(test_tablename, num_models)

    model_ids = engine.persistence_layer.get_model_ids(test_tablename)
    # Compare against a real list: on Python 3 ``range`` is a lazy object that
    # never compares equal to a list, so the bare ``range`` form would fail no
    # matter which ids were returned. On Python 2 this is unchanged.
    assert sorted(model_ids) == list(range(num_models))
    for i in range(num_models):
        model = engine.persistence_layer.get_models(test_tablename, i)
        assert model['iterations'] == 0
Exemplo n.º 3
0
def test_infer():
    """Check Engine.infer on a table with missing cells at two confidence levels.

    Per the dataset note below, 'data/dha_missing.csv' lacks qual_score in its
    first 5 rows and name in the following 5 rows. With confidence 0 the test
    expects those cells to be filled in: each inferred name must equal a name
    found in one of the other rows, and each inferred qual_score must be a
    float strictly inside the range of the remaining qual_scores. With
    confidence 0.9 the same cells are expected to come back as NaN.
    """
    # TODO: whereclauses
    test_tablename, _ = create_dha(path='data/dha_missing.csv')

    # dha_missing has missing qual_score in first 5 rows, and missing name in rows 6-10.
    num_rows = 307  # total number of rows in the dataset
    num_missing = 5  # size of each block of rows that has a missing cell
    engine = Engine(seed=0)
    engine.initialize_models(test_tablename, 20)

    functions = bql.bql_statement.parseString('infer name, qual_score from test',
                                              parseAll=True).functions
    whereclause = None
    limit = float('inf')
    order_by = False
    numsamples = 30
    confidence = 0
    infer_result = engine.infer(test_tablename, functions, confidence, whereclause, limit,
                                numsamples, order_by)
    assert 'column_labels' in infer_result
    assert 'data' in infer_result
    assert infer_result['column_labels'] == ['key', 'name', 'qual_score']
    data = infer_result['data']
    # Separate asserts so a failure shows which dimension is wrong.
    assert len(data) == num_rows
    assert len(data[0]) == len(infer_result['column_labels'])

    # Python 2 has a distinct ``unicode`` type; on Python 3 text is plain ``str``.
    try:
        text_type = unicode
    except NameError:
        text_type = str

    # Exact type comparisons are kept as in the original checks: isinstance
    # would also accept subclasses (e.g. numpy.float64 for float).
    assert type(data[0][0]) == numpy.string_  # key comes back as a numpy string
    name_type = type(data[0][1])
    assert (name_type == text_type) or (name_type == numpy.string_)  # name is a string
    assert type(data[0][2]) == float  # qual_score is a float

    # Rows whose name was present in the file: everything except rows 5-9.
    # list(...) keeps the concatenation valid on Python 3 as well as Python 2.
    named_rows = list(range(num_missing)) + list(range(2 * num_missing, num_rows))
    all_possible_names = [data[row][1] for row in named_rows]
    all_observed_qual_scores = [data[row][2] for row in range(num_missing, num_rows)]
    # The bounds do not change inside the loop, so compute them once.
    lowest_observed = min(all_observed_qual_scores)
    highest_observed = max(all_observed_qual_scores)

    for row in range(num_missing):
        inferred_name = data[row + num_missing][1]
        inferred_qual_score = data[row][2]
        assert inferred_name in all_possible_names
        assert type(inferred_qual_score) == float
        assert inferred_qual_score > lowest_observed
        assert inferred_qual_score < highest_observed

    # Now, try infer with higher confidence, and make sure the missing cells
    # aren't inferred anymore.
    confidence = 0.9
    infer_result = engine.infer(test_tablename, functions, confidence, whereclause, limit,
                                numsamples, order_by)
    data = infer_result['data']

    for row in range(num_missing):
        # TODO: what do missing values look like? these should be missing
        # NOTE(review): numpy.isnan raises TypeError on string input, so the
        # name check assumes a missing name is returned as a float NaN - confirm.
        inferred_name = data[row + num_missing][1]
        inferred_qual_score = data[row][2]
        assert numpy.isnan(inferred_name)
        assert numpy.isnan(inferred_qual_score)
Exemplo n.º 4
0
def test_infer():
    """Check Engine.infer on a table with missing cells at two confidence levels.

    Per the dataset note below, 'data/dha_missing.csv' lacks qual_score in its
    first 5 rows and name in the following 5 rows. With confidence 0 the test
    expects those cells to be filled in: each inferred name must equal a name
    found in one of the other rows, and each inferred qual_score must be a
    float strictly inside the range of the remaining qual_scores. With
    confidence 0.9 the same cells are expected to come back as NaN.
    """
    # TODO: whereclauses
    test_tablename, _ = create_dha(path='data/dha_missing.csv')

    # dha_missing has missing qual_score in first 5 rows, and missing name in rows 6-10.
    num_rows = 307  # total number of rows in the dataset
    num_missing = 5  # size of each block of rows that has a missing cell
    engine = Engine(seed=0)
    engine.initialize_models(test_tablename, 20)

    functions = bql.bql_statement.parseString('infer name, qual_score from test',
                                              parseAll=True).functions
    whereclause = None
    limit = float('inf')
    order_by = False
    numsamples = 30
    confidence = 0
    infer_result = engine.infer(test_tablename, functions, confidence, whereclause, limit,
                                numsamples, order_by)
    assert 'column_labels' in infer_result
    assert 'data' in infer_result
    assert infer_result['column_labels'] == ['key', 'name', 'qual_score']
    data = infer_result['data']
    # Separate asserts so a failure shows which dimension is wrong.
    assert len(data) == num_rows
    assert len(data[0]) == len(infer_result['column_labels'])

    # Python 2 has a distinct ``unicode`` type; on Python 3 text is plain ``str``.
    try:
        text_type = unicode
    except NameError:
        text_type = str

    # Exact type comparisons are kept as in the original checks: isinstance
    # would also accept subclasses (e.g. numpy.float64 for float).
    assert type(data[0][0]) == numpy.string_  # key comes back as a numpy string
    name_type = type(data[0][1])
    assert (name_type == text_type) or (name_type == numpy.string_)  # name is a string
    assert type(data[0][2]) == float  # qual_score is a float

    # Rows whose name was present in the file: everything except rows 5-9.
    # list(...) keeps the concatenation valid on Python 3 as well as Python 2.
    named_rows = list(range(num_missing)) + list(range(2 * num_missing, num_rows))
    all_possible_names = [data[row][1] for row in named_rows]
    all_observed_qual_scores = [data[row][2] for row in range(num_missing, num_rows)]
    # The bounds do not change inside the loop, so compute them once.
    lowest_observed = min(all_observed_qual_scores)
    highest_observed = max(all_observed_qual_scores)

    for row in range(num_missing):
        inferred_name = data[row + num_missing][1]
        inferred_qual_score = data[row][2]
        assert inferred_name in all_possible_names
        assert type(inferred_qual_score) == float
        assert inferred_qual_score > lowest_observed
        assert inferred_qual_score < highest_observed

    # Now, try infer with higher confidence, and make sure the missing cells
    # aren't inferred anymore.
    confidence = 0.9
    infer_result = engine.infer(test_tablename, functions, confidence, whereclause, limit,
                                numsamples, order_by)
    data = infer_result['data']

    for row in range(num_missing):
        # TODO: what do missing values look like? these should be missing
        # NOTE(review): numpy.isnan raises TypeError on string input, so the
        # name check assumes a missing name is returned as a float NaN - confirm.
        inferred_name = data[row + num_missing][1]
        inferred_qual_score = data[row][2]
        assert numpy.isnan(inferred_name)
        assert numpy.isnan(inferred_qual_score)