def test_initialize_models():
    """Check that initializing models registers them with zero iterations.

    Creates the table from ``data/dha_missing.csv``, asks the engine for five
    models, then verifies through the persistence layer that the stored model
    ids are exactly ``0 .. num_models - 1`` and that every stored model
    reports ``iterations == 0``.
    """
    test_tablename, _ = create_dha(path='data/dha_missing.csv')
    engine = Engine(seed=0)
    num_models = 5
    engine.initialize_models(test_tablename, num_models)

    model_ids = engine.persistence_layer.get_model_ids(test_tablename)
    # Compare against a concrete list: a lazy range object never compares
    # equal to a list, so the bare ``== range(...)`` form only holds where
    # range() itself returns a list.
    assert sorted(model_ids) == list(range(num_models))

    for i in range(num_models):
        model = engine.persistence_layer.get_models(test_tablename, i)
        assert model['iterations'] == 0
def test_initialize_models():
    """Check that initializing models registers them with zero iterations.

    Creates the table from ``data/dha_missing.csv``, asks the engine for five
    models, then verifies through the persistence layer that the stored model
    ids are exactly ``0 .. num_models - 1`` and that every stored model
    reports ``iterations == 0``.
    """
    test_tablename, _ = create_dha(path='data/dha_missing.csv')
    engine = Engine(seed=0)
    num_models = 5
    engine.initialize_models(test_tablename, num_models)

    model_ids = engine.persistence_layer.get_model_ids(test_tablename)
    # Compare against a concrete list: a lazy range object never compares
    # equal to a list, so the bare ``== range(...)`` form only holds where
    # range() itself returns a list.
    assert sorted(model_ids) == list(range(num_models))

    for i in range(num_models):
        model = engine.persistence_layer.get_models(test_tablename, i)
        assert model['iterations'] == 0
def test_infer():
    """Exercise ``engine.infer`` on a table with known missing cells.

    First runs ``infer name, qual_score`` at confidence 0 and checks the
    result's column labels, row count, cell types, and that the cells which
    were missing in the input now hold plausible values. Then reruns the same
    query at confidence 0.9 and checks that those same cells come back as NaN.
    """
    # TODO: whereclauses
    test_tablename, _ = create_dha(path='data/dha_missing.csv')

    # dha_missing has missing qual_score in first 5 rows, and missing name in
    # rows 6-10 (0-based indices 0-4 and 5-9 respectively, as used below).
    engine = Engine(seed=0)
    engine.initialize_models(test_tablename, 20)

    functions = bql.bql_statement.parseString(
        'infer name, qual_score from test', parseAll=True).functions
    whereclause = None
    limit = float('inf')
    order_by = False
    numsamples = 30
    confidence = 0
    infer_result = engine.infer(test_tablename, functions, confidence,
                                whereclause, limit, numsamples, order_by)
    assert 'column_labels' in infer_result
    assert 'data' in infer_result
    assert infer_result['column_labels'] == ['key', 'name', 'qual_score']

    data = infer_result['data']
    # 307 is the total number of rows in the dataset.
    assert len(data) == 307
    assert len(data[0]) == len(infer_result['column_labels'])

    # unicode on Python 2, str on Python 3; avoids referencing the bare
    # ``unicode`` name, which does not exist on Python 3.
    text_type = type(u'')

    # The key column is returned as a numpy string, not an int.
    assert type(data[0][0]) is numpy.string_
    name_type = type(data[0][1])
    assert name_type is text_type or name_type is numpy.string_  # name is a string
    assert type(data[0][2]) is float  # qual_score is a float

    # Rows whose name was present in the input: everything except indices 5-9.
    # Built from two lists because range objects cannot be concatenated with
    # ``+`` where range() is lazy.
    rows_with_name = list(range(5)) + list(range(10, 307))
    all_possible_names = [data[row][1] for row in rows_with_name]
    # Rows whose qual_score was present in the input: indices 5-306.
    all_observed_qual_scores = [data[row][2] for row in range(5, 307)]

    # The bounds do not change inside the loop, so compute them once.
    lowest_observed = min(all_observed_qual_scores)
    highest_observed = max(all_observed_qual_scores)

    for row in range(5):
        inferred_name = data[row + 5][1]
        inferred_qual_score = data[row][2]
        assert inferred_name in all_possible_names
        assert type(inferred_qual_score) is float
        assert inferred_qual_score > lowest_observed
        assert inferred_qual_score < highest_observed

    # Now, try infer with higher confidence, and make sure that name isn't
    # inferred anymore.
    confidence = 0.9
    infer_result = engine.infer(test_tablename, functions, confidence,
                                whereclause, limit, numsamples, order_by)
    data = infer_result['data']

    for row in range(5):
        # TODO: what do missing values look like? these should be missing
        inferred_name = data[row + 5][1]
        inferred_qual_score = data[row][2]
        assert numpy.isnan(inferred_name)
        assert numpy.isnan(inferred_qual_score)
def test_infer():
    """Exercise ``engine.infer`` on a table with known missing cells.

    First runs ``infer name, qual_score`` at confidence 0 and checks the
    result's column labels, row count, cell types, and that the cells which
    were missing in the input now hold plausible values. Then reruns the same
    query at confidence 0.9 and checks that those same cells come back as NaN.
    """
    # TODO: whereclauses
    test_tablename, _ = create_dha(path='data/dha_missing.csv')

    # dha_missing has missing qual_score in first 5 rows, and missing name in
    # rows 6-10 (0-based indices 0-4 and 5-9 respectively, as used below).
    engine = Engine(seed=0)
    engine.initialize_models(test_tablename, 20)

    functions = bql.bql_statement.parseString(
        'infer name, qual_score from test', parseAll=True).functions
    whereclause = None
    limit = float('inf')
    order_by = False
    numsamples = 30
    confidence = 0
    infer_result = engine.infer(test_tablename, functions, confidence,
                                whereclause, limit, numsamples, order_by)
    assert 'column_labels' in infer_result
    assert 'data' in infer_result
    assert infer_result['column_labels'] == ['key', 'name', 'qual_score']

    data = infer_result['data']
    # 307 is the total number of rows in the dataset.
    assert len(data) == 307
    assert len(data[0]) == len(infer_result['column_labels'])

    # unicode on Python 2, str on Python 3; avoids referencing the bare
    # ``unicode`` name, which does not exist on Python 3.
    text_type = type(u'')

    # The key column is returned as a numpy string, not an int.
    assert type(data[0][0]) is numpy.string_
    name_type = type(data[0][1])
    assert name_type is text_type or name_type is numpy.string_  # name is a string
    assert type(data[0][2]) is float  # qual_score is a float

    # Rows whose name was present in the input: everything except indices 5-9.
    # Built from two lists because range objects cannot be concatenated with
    # ``+`` where range() is lazy.
    rows_with_name = list(range(5)) + list(range(10, 307))
    all_possible_names = [data[row][1] for row in rows_with_name]
    # Rows whose qual_score was present in the input: indices 5-306.
    all_observed_qual_scores = [data[row][2] for row in range(5, 307)]

    # The bounds do not change inside the loop, so compute them once.
    lowest_observed = min(all_observed_qual_scores)
    highest_observed = max(all_observed_qual_scores)

    for row in range(5):
        inferred_name = data[row + 5][1]
        inferred_qual_score = data[row][2]
        assert inferred_name in all_possible_names
        assert type(inferred_qual_score) is float
        assert inferred_qual_score > lowest_observed
        assert inferred_qual_score < highest_observed

    # Now, try infer with higher confidence, and make sure that name isn't
    # inferred anymore.
    confidence = 0.9
    infer_result = engine.infer(test_tablename, functions, confidence,
                                whereclause, limit, numsamples, order_by)
    data = infer_result['data']

    for row in range(5):
        # TODO: what do missing values look like? these should be missing
        inferred_name = data[row + 5][1]
        inferred_qual_score = data[row][2]
        assert numpy.isnan(inferred_name)
        assert numpy.isnan(inferred_qual_score)