def test_flatten_columns_with_strings_and_vectors_with_default_delimiter(self):
    """Flatten two string columns and a vector column without naming a delimiter.

    Also verifies that supplying three delimiters is rejected, since only
    two of the three flattened columns are strings.
    """
    schema = [('a', ta.int32), ('b', str), ('c', ta.vector(2)), ('d', str)]
    rows = [
        [1, "1,2", [1, 2], "a,b"],
        [2, "3,4", [3, 4], "c,d"],
        [3, "5,6", [5, 6], "e,f"],
        [4, "7,8", [7, 8], "g,h"],
    ]
    frame = ta.Frame(ta.UploadRows(rows, schema))

    # 'b' and 'd' are the only string columns, so a list of three
    # delimiters does not line up and must raise.
    with self.assertRaises(Exception):
        frame.flatten_columns(['b', 'c', 'd'], [',', ',', ','])

    # No delimiter argument: the comma-separated strings above are expected
    # to be split by the default delimiter.
    frame.flatten_columns(['b', 'c', 'd'])

    # Every source row yields two flattened rows.
    flattened = [
        [1, "1", 1.0, "a"],
        [1, "2", 2.0, "b"],
        [2, "3", 3.0, "c"],
        [2, "4", 4.0, "d"],
        [3, "5", 5.0, "e"],
        [3, "6", 6.0, "f"],
        [4, "7", 7.0, "g"],
        [4, "8", 8.0, "h"],
    ]
    self.assertEqual(frame.row_count, 8)
    self.assertEqual(frame.take(frame.row_count), flattened)
def test_adf_column_types(self):
    """
    Tests the Augmented Dickey-Fuller test with different column types

    A string column must be rejected with a "was not numerical" error,
    while the int and float columns must each return a non-None result.
    """
    data = [[1, "a", 1.5], [2, "b", 18.5], [4, "c", 22.1],
            [5, "d", 19.0], [7, "e", 25.6], [8, "f", 36.75]]
    schema = [("int_column", ta.int32), ("str_column", str),
              ("float_column", ta.float32)]
    frame = ta.Frame(ta.UploadRows(data, schema))

    try:
        # string column should have an error
        frame.timeseries_augmented_dickey_fuller_test("str_column", 0)
    except Exception as e:
        # Prefer the Python 2 `message` attribute when present; fall back to
        # str(e) on interpreters where exceptions no longer carry it.
        self.assertIn("Column str_column was not numerical",
                      getattr(e, "message", str(e)))
    else:
        # Reported from the else branch so the failure cannot be swallowed
        # by the except clause above.
        self.fail("Expected error since the str_column is not numerical.")

    # Numerical columns should not have an error
    self.assertNotEqual(
        frame.timeseries_augmented_dickey_fuller_test("int_column", 0), None)
    self.assertNotEqual(
        frame.timeseries_augmented_dickey_fuller_test("float_column", 0), None)
def test_flatten_column_with_differing_size_vectors(self):
    """Flatten a 3-element and a 2-element vector column in one call.

    The expected rows pair the vectors element by element; the third row of
    each group carries 0.0 for the shorter vector column.
    """
    schema = [('a', ta.int32), ('b', ta.vector(3)), ('c', ta.vector(2))]
    rows = [
        [1, [1, 2, 3], [8, 7]],
        [2, [4, 5, 6], [6, 5]],
        [3, [7, 8, 9], [4, 3]],
        [4, [10, 11, 12], [2, 1]],
    ]
    frame = ta.Frame(ta.UploadRows(rows, schema))

    frame.flatten_columns(['b', 'c'])

    # Three flattened rows per source row (the length of the longer vector).
    flattened = [
        [1, 1.0, 8.0],
        [1, 2.0, 7.0],
        [1, 3.0, 0.0],
        [2, 4.0, 6.0],
        [2, 5.0, 5.0],
        [2, 6.0, 0.0],
        [3, 7.0, 4.0],
        [3, 8.0, 3.0],
        [3, 9.0, 0.0],
        [4, 10.0, 2.0],
        [4, 11.0, 1.0],
        [4, 12.0, 0.0],
    ]
    self.assertEqual(frame.row_count, 12)
    self.assertEqual(frame.take(frame.row_count), flattened)
def test_bpt_invalid_column(self):
    """
    Tests the Breusch-Pagan test with non-numerical data, and expects an error

    The string column is tried first as the y column and then as one of the
    x columns; both calls must fail with a "was not numerical" error. A call
    using only numerical columns must return a non-None result.
    """
    data = [[1, "a", 1.5], [2, "b", 18.5], [4, "c", 22.1],
            [5, "d", 19.0], [7, "e", 25.6], [8, "f", 36.75]]
    schema = [("int_column", ta.int32), ("str_column", str),
              ("float_column", ta.float32)]
    frame = ta.Frame(ta.UploadRows(data, schema))
    expected_text = "Column str_column was not numerical"

    # String column used as the y column.
    try:
        frame.timeseries_breusch_pagan_test(
            "str_column", ["int_column", "float_column"])
    except Exception as e:
        # Prefer the Python 2 `message` attribute when present; fall back to
        # str(e) on interpreters where exceptions no longer carry it.
        self.assertIn(expected_text, getattr(e, "message", str(e)))
    else:
        # Reported from the else branch so the failure cannot be swallowed
        # by the except clause above.
        self.fail("Expected error since the y column specified has strings")

    # String column used as one of the x columns.
    try:
        frame.timeseries_breusch_pagan_test(
            "float_column", ["int_column", "str_column"])
    except Exception as e:
        self.assertIn(expected_text, getattr(e, "message", str(e)))
    else:
        self.fail(
            "Expected error since one of the x columns specified has strings."
        )

    # numerical data should not have an error
    self.assertNotEqual(
        frame.timeseries_breusch_pagan_test("float_column", ["int_column"]),
        None)
def test_frame_upload_raw_list_data(self):
    """Round trip: list data --> upload to frame --> 'take' back to list and compare."""
    schema = [('n', int), ('s', str), ('v', ta.vector(2))]
    data = [
        [1, 'one', [1.0, 1.1]],
        [2, 'two', [2.0, 2.2]],
        [3, 'three', [3.0, 3.3]],
    ]
    frame = ta.Frame(ta.UploadRows(data, schema))

    # Ask for more rows than were uploaded; the length check below expects
    # to get back exactly the uploaded rows.
    taken = frame.take(5)
    self.assertEqual(len(data), len(taken))

    # Lengths are already known to match, so pairing the rows up is safe.
    for uploaded_row, taken_row in zip(data, taken):
        self.assertEqual(len(uploaded_row), len(taken_row))
        for uploaded_value, taken_value in zip(uploaded_row, taken_row):
            self.assertEqual(uploaded_value, taken_value)
def test_kmeans_train_publish(self):
    """Train a KMeansModel on the numeric "data" column, then publish it.

    The training output must expose the 'within_set_sum_of_squared_error'
    key before the model is published.
    """
    frame = ta.Frame(
        ta.UploadRows(
            [[2, "ab"], [1, "cd"], [7, "ef"], [1, "gh"], [9, "ij"],
             [2, "kl"], [0, "mn"], [6, "op"], [5, "qr"]],
            [("data", ta.float64), ("name", str)]))
    model = ta.KMeansModel()
    train_output = model.train(frame, ["data"], [1], 3)
    # Membership via `in` works on both Python 2 and 3, whereas
    # dict.has_key() exists only on Python 2; assertIn also reports the
    # missing key on failure.
    self.assertIn('within_set_sum_of_squared_error', train_output)
    model.publish()
def test_flatten_columns_with_single_vector(self):
    """Flatten one 2-element vector column given as a bare column name."""
    schema = [('a', ta.int32), ('b', ta.vector(2))]
    rows = [
        [1, [1, 2]],
        [2, [3, 4]],
        [3, [5, 6]],
        [4, [7, 8]],
    ]
    frame = ta.Frame(ta.UploadRows(rows, schema))

    # The column is passed as a plain string rather than a list.
    frame.flatten_columns('b')

    # Each source row becomes two rows, one per vector element.
    flattened = [
        [1, 1.0],
        [1, 2.0],
        [2, 3.0],
        [2, 4.0],
        [3, 5.0],
        [3, 6.0],
        [4, 7.0],
        [4, 8.0],
    ]
    self.assertEqual(frame.row_count, 8)
    self.assertEqual(frame.take(frame.row_count), flattened)
def test_missing_values_drop_rows(self):
    """Rows whose 'a' value is missing (None) can be removed with drop_rows."""
    # Create frame with missing values using upload rows
    schema = [('a', ta.int32)]
    data = [[1], [4], [None], [None], [10], [None]]
    frame = ta.Frame(ta.UploadRows(data, schema))

    # Check that frame was correctly created
    self.assertEqual(6, frame.row_count)
    self.assertEqual(data, frame.take(frame.row_count))

    # Check that we can drop rows with missing values.
    # Identity comparison is the idiomatic test for None and cannot be
    # affected by a value type that customizes equality.
    frame.drop_rows(lambda row: row['a'] is None)
    expected = [[1], [4], [10]]
    self.assertEqual(expected, frame.take(frame.row_count, columns='a'))
def test_flatten_columns_with_multiple_vectors(self):
    """Flatten two equally sized vector columns in a single call."""
    schema = [('a', ta.int32), ('b', ta.vector(2)), ('c', ta.vector(2))]
    rows = [
        [1, [1, 2], [8, 7]],
        [2, [3, 4], [6, 5]],
        [3, [5, 6], [4, 3]],
        [4, [7, 8], [2, 1]],
    ]
    frame = ta.Frame(ta.UploadRows(rows, schema))

    frame.flatten_columns(['b', 'c'])

    # Elements of 'b' and 'c' are paired by position within each source row.
    flattened = [
        [1, 1.0, 8.0],
        [1, 2.0, 7.0],
        [2, 3.0, 6.0],
        [2, 4.0, 5.0],
        [3, 5.0, 4.0],
        [3, 6.0, 3.0],
        [4, 7.0, 2.0],
        [4, 8.0, 1.0],
    ]
    self.assertEqual(frame.row_count, 8)
    self.assertEqual(frame.take(frame.row_count), flattened)
def upload_rows(frame_name, names, rows):
    """Append the points of several series to the frame called ``frame_name``.

    ``rows`` is a sequence of series; each series is a sequence of points
    that are indexed with the keys 'DateTime' and 'Value'. The series at
    position ``k`` is labelled with ``names[k]``.

    When the module-level ``mode`` is None or 'local', a warning is printed
    and nothing is uploaded. Returns None in every case.

    Raises:
        IndexError: if a non-empty series has no matching entry in ``names``.
    """
    if mode is None or mode == 'local':
        print('Warning: Not connected to ATK')
        return

    # Build the upload list directly instead of filling a fixed-size,
    # None-padded buffer: this removes the implicit cap of 100 points per
    # series (on average) and guarantees no None placeholders are uploaded.
    # NOTE(review): each entry wraps its three values in an extra list,
    # exactly as before -- confirm this nesting matches `schema`.
    upload = [
        [[point['DateTime'], names[k], point['Value']]]
        for k, series in enumerate(rows)
        for point in series
    ]

    target = get_frame(frame_name)
    target.append(tap.UploadRows(upload, schema))
def test_flatten_columns_with_strings_and_vectors_with_one_delimiter(self):
    """Flatten two string columns and a vector column with a single ':' delimiter."""
    schema = [('a', ta.int32), ('b', str), ('c', ta.vector(2)), ('d', str)]
    rows = [
        [1, "1:2", [1, 2], "a:b"],
        [2, "3:4", [3, 4], "c:d"],
        [3, "5:6", [5, 6], "e:f"],
        [4, "7:8", [7, 8], "g:h"],
    ]
    frame = ta.Frame(ta.UploadRows(rows, schema))

    # One delimiter is supplied for all three columns.
    frame.flatten_columns(['b', 'c', 'd'], ':')

    # Every source row yields two flattened rows.
    flattened = [
        [1, "1", 1.0, "a"],
        [1, "2", 2.0, "b"],
        [2, "3", 3.0, "c"],
        [2, "4", 4.0, "d"],
        [3, "5", 5.0, "e"],
        [3, "6", 6.0, "f"],
        [4, "7", 7.0, "g"],
        [4, "8", 8.0, "h"],
    ]
    self.assertEqual(frame.row_count, 8)
    self.assertEqual(frame.take(frame.row_count), flattened)
def test_flatten_columns_with_mismatch_delimiter_count(self):
    """A delimiter list must have one entry per column; a single delimiter applies to all."""
    # we need a frame with more than three columns for this test
    schema = [('a', ta.int32), ('b', str), ('c', str), ('d', str)]
    rows = [
        [1, "solo,mono,single", "a,b,c", "1+2+3"],
        [2, "duo,double", "d,e", "4+5"],
    ]
    frame = ta.Frame(ta.UploadRows(rows, schema))

    # when providing more than one delimiter, count must match column count:
    # first too few delimiters, then too many -- both must raise.
    for wrong_delimiters in ([',', ','], [',', ',', '+', '|']):
        with self.assertRaises(Exception):
            frame.flatten_columns(['b', 'c', 'd'], wrong_delimiters)

    # giving just one delimiter means that the same delimiter is used for
    # all columns
    frame.flatten_columns(['b', 'c'], ',')
    self.assertEqual(frame.row_count, 5)
def get_frame(name):
    """Return the frame to use for ``name``, creating an empty one if needed.

    Lookup order:
      1. When the module-level ``mode`` is None or 'local', print a warning
         and return None.
      2. If the module-level ``frame`` is already set, return it. The check
         does not look at ``name``.
      3. If ``name`` is among the existing frame names, fetch and return that
         frame (the module-level ``frame`` is left untouched).
      4. Otherwise create an empty frame from ``schema``, store it in the
         module-level ``frame``, give it ``name`` and return it.
    """
    global frame

    if mode is None or mode == 'local':
        print('Warning: Not connected to ATK')
        return

    if frame is not None:
        return frame

    if name in tap.get_frame_names():
        return tap.get_frame(name)

    # Nothing cached and nothing with that name: start a new, empty frame.
    frame = tap.Frame(tap.UploadRows([], schema))
    frame.name = name
    return frame
def test_adf_params(self):
    """
    Test the Augmented Dickey-Fuller test with invalid parameters

    The test must return a non-None result both without a regression
    parameter and with "c", and must raise for the unknown value "bogus".
    """
    data = [[12.88969427], [13.54964408], [13.8432745], [12.13843611],
            [12.81156092], [14.2499628], [15.12102595]]
    frame = ta.Frame(ta.UploadRows(data, [("data", ta.float32)]))

    # Test calling ADF test with and without regression parameter
    self.assertNotEqual(
        frame.timeseries_augmented_dickey_fuller_test("data", 0), None)
    self.assertNotEqual(
        frame.timeseries_augmented_dickey_fuller_test("data", 0, "c"), None)

    try:
        frame.timeseries_augmented_dickey_fuller_test("data", 0, "bogus")
    except Exception as e:
        # Prefer the Python 2 `message` attribute when present; fall back to
        # str(e) on interpreters where exceptions no longer carry it.
        self.assertIn("bogus is not c, ct, or ctt",
                      getattr(e, "message", str(e)))
    else:
        # Without this branch the test would pass silently if the invalid
        # regression value were accepted.
        self.fail("Expected error since bogus is not a valid regression "
                  "parameter")
def test_dwtest_column_types(self):
    """
    Tests that the Durbin-Watson test only works with numerical columns

    A string column must be rejected with a "was not numerical" error,
    while the int and float columns must each return a non-None result.
    """
    data = [[1, "a", 1.5], [2, "b", 18.5], [4, "c", 22.1],
            [5, "d", 19.0], [7, "e", 25.6], [8, "f", 36.75]]
    schema = [("int_column", ta.int32), ("str_column", str),
              ("float_column", ta.float32)]
    frame = ta.Frame(ta.UploadRows(data, schema))

    try:
        # calling durbin-watson with a string column should fail
        frame.timeseries_durbin_watson_test("str_column")
    except Exception as e:
        # Prefer the Python 2 `message` attribute when present; fall back to
        # str(e) on interpreters where exceptions no longer carry it.
        self.assertIn("Column str_column was not numerical",
                      getattr(e, "message", str(e)))
    else:
        # Reported from the else branch so the failure cannot be swallowed
        # by the except clause above.
        self.fail("Expected error since the column must be numerical")

    # int and float columns should not give any error
    self.assertNotEqual(
        frame.timeseries_durbin_watson_test("int_column"), None)
    self.assertNotEqual(
        frame.timeseries_durbin_watson_test("float_column"), None)
def test_missing_values_add_column(self):
    """add_columns can derive a column in which missing values become 0."""
    # Build a one-column frame where half of the values are missing.
    data = [[1], [4], [None], [None], [10], [None]]
    schema = [('a', ta.int32)]
    frame = ta.Frame(ta.UploadRows(data, schema))

    # The upload must round-trip unchanged, missing values included.
    self.assertEqual(6, frame.row_count)
    self.assertEqual(data, frame.take(frame.row_count))

    def zero_if_missing(value):
        """Return 0 for None, otherwise the value itself."""
        return 0 if value is None else value

    # Add 'a_corrected', computed from 'a' with missing values replaced by 0.
    frame.add_columns(lambda row: zero_if_missing(row['a']),
                      ('a_corrected', ta.int32),
                      columns_accessed='a')

    expected = [[1], [4], [0], [0], [10], [0]]
    self.assertEqual(expected,
                     frame.take(frame.row_count, columns='a_corrected'))