def test_generic_drop_with_list(self):
    """Drop a list holding a frame proxy and a frame name through ta.drop().

    The test expects drop() to report 2 deleted items and both names to be
    gone from ta.get_frame_names() afterwards.
    """
    first_name = str(uuid.uuid1()).replace('-', '_')
    first_frame = ta.Frame(name=first_name)
    second_name = str(uuid.uuid1()).replace('-', '_')
    ta.Frame(name=second_name)

    # One entry is a proxy object, the other a plain name.
    targets = [first_frame, second_name]

    # Both frames must be listed before the drop.
    for created in (first_name, second_name):
        self.assertIn(created, ta.get_frame_names(),
                      created + " should exist in the list of frame names")

    dropped_count = ta.drop(targets)
    self.assertEqual(2, dropped_count,
                     "drop() should have deleted the 2 items from the list")

    # Neither frame may be listed after the drop.
    for removed in (first_name, second_name):
        self.assertNotIn(removed, ta.get_frame_names(),
                         removed + " should not be in the list of frame names")
def test_generic_drop_duplicate_items(self):
    """Pass the same frame to ta.drop() three times and expect a count of 1."""
    unique_name = str(uuid.uuid1()).replace('-', '_')
    proxy = ta.Frame(name=unique_name)

    # The freshly created frame must be listed.
    self.assertIn(unique_name, ta.get_frame_names(),
                  "%s should exist in the list of frame names" % unique_name)

    # The proxy (twice) and the name all refer to one frame.
    dropped_count = ta.drop(proxy, proxy, unique_name)
    self.assertEqual(1, dropped_count, "drop() should have deleted 1 item")

    # The frame must no longer be listed.
    self.assertNotIn(unique_name, ta.get_frame_names(),
                     "%s should not be in the list of frame names" % unique_name)
def test_frame_rename(self):
    """Rename a CSV-backed frame and expect the new name in the frame listing."""
    print("define csv file")
    column_schema = [('a', str),
                     ('b', ta.int32),
                     ('labels', ta.int32),
                     ('predictions', ta.int32)]
    source = ta.CsvFile("/datasets/classification-compute.csv",
                        schema=column_schema,
                        delimiter=',',
                        skip_header_lines=1)

    print("create frame")
    renamed = ta.Frame(source, name="test_frame_rename")

    target_name = "test_frame_new_name"
    self.assertNotIn(target_name, ta.get_frame_names(),
                     "test_frame_new_name should not exist in list of frames")

    print("renaming frame")
    renamed.name = target_name
    self.assertIn(target_name, ta.get_frame_names(),
                  "test_frame_new_name should exist in list of frames")
def test_frame_drop(self):
    """Drop a frame by proxy object and then by name, checking the listing each time.

    After each drop the frame name must be absent from ta.get_frame_names();
    the drop-by-name call must additionally report exactly one deleted frame.
    """
    frame_name = "test_frame_drop"

    print("define csv file")
    csv = ta.CsvFile("/datasets/classification-compute.csv",
                     schema=[('a', str),
                             ('b', ta.int32),
                             ('labels', ta.int32),
                             ('predictions', ta.int32)],
                     delimiter=',',
                     skip_header_lines=1)

    print("create frame")
    frame = ta.Frame(csv, name=frame_name)

    print("dropping frame by entity")
    ta.drop_frames(frame)
    self.assertNotIn(frame_name, ta.get_frame_names(),
                     "test_frame_drop should not exist in list of frames")

    # Re-create the frame so the same name can be dropped again, by name.
    ta.Frame(csv, name=frame_name)

    print("dropping frame by name")
    self.assertEqual(1, ta.drop_frames(frame_name),
                     "drop_frames() should have deleted one frame")
    # Query a fresh listing here: reusing the one fetched before the frame
    # was re-created would make this assertion pass unconditionally.
    self.assertNotIn(frame_name, ta.get_frame_names(),
                     "test_frame_drop should not exist in list of frames")
def test_generic_drop_duplicate_items(self):
    """Pass the same frame to ta.drop() three times and expect a count of 1."""
    unique_name = str(uuid.uuid1()).replace('-', '_')
    proxy = ta.Frame(name=unique_name)

    # The freshly created frame must be listed.
    self.assertIn(unique_name, ta.get_frame_names(),
                  "%s should exist in the list of frame names" % unique_name)

    # The proxy (twice) and the name all refer to one frame.
    dropped_count = ta.drop(proxy, proxy, unique_name)
    self.assertEqual(1, dropped_count, "drop() should have deleted 1 item")

    # The frame must no longer be listed.
    self.assertNotIn(unique_name, ta.get_frame_names(),
                     "%s should not be in the list of frame names" % unique_name)
def test_access_refreshes_frames(self):
    """Tests that some actions do or do not update the last_read_date entity property"""
    frame_name = "update_last_read"
    dates_csv = ta.CsvFile("/datasets/dates.csv",
                           schema=[('start', ta.datetime),
                                   ('id', int),
                                   ('stop', ta.datetime),
                                   ('color', str)],
                           delimiter=',')
    if frame_name in ta.get_frame_names():
        ta.drop_frames(frame_name)

    # The frame is named to save it from the other GC-related test in this suite.
    dated = ta.Frame(dates_csv, name=frame_name)

    # Reading the property twice in a row must give the same value.
    baseline = dated.last_read_date
    repeated = dated.last_read_date
    self.assertEqual(baseline, repeated)

    # Reading schema (a metadata property) should not move the last read date.
    _ = dated.schema
    after_schema = dated.last_read_date
    self.assertEqual(baseline, after_schema)

    # inspect() should move the last read date forward.
    dated.inspect()
    after_inspect = dated.last_read_date
    self.assertLess(after_schema, after_inspect)

    # copy() should move the last read date forward.
    dated.copy()
    after_copy = dated.last_read_date
    self.assertLess(after_inspect, after_copy)

    # bin_column() is expected to move the last read date forward as well.
    dated.bin_column('id', [3, 5, 8])
    after_bin = dated.last_read_date
    self.assertLess(after_copy, after_bin)
def test_generic_drop_with_list(self):
    """Drop a list holding a frame proxy and a frame name through ta.drop().

    The test expects drop() to report 2 deleted items and both names to be
    gone from ta.get_frame_names() afterwards.
    """
    first_name = str(uuid.uuid1()).replace('-', '_')
    first_frame = ta.Frame(name=first_name)
    second_name = str(uuid.uuid1()).replace('-', '_')
    ta.Frame(name=second_name)

    # One entry is a proxy object, the other a plain name.
    targets = [first_frame, second_name]

    # Both frames must be listed before the drop.
    for created in (first_name, second_name):
        self.assertIn(created, ta.get_frame_names(),
                      created + " should exist in the list of frame names")

    dropped_count = ta.drop(targets)
    self.assertEqual(2, dropped_count,
                     "drop() should have deleted the 2 items from the list")

    # Neither frame may be listed after the drop.
    for removed in (first_name, second_name):
        self.assertNotIn(removed, ta.get_frame_names(),
                         removed + " should not be in the list of frame names")
def test_drop_frame_that_does_not_exist(self):
    """Call drop_frames() with an unused name and expect it to report 0 deletions."""
    missing_name = str(uuid.uuid1()).replace('-', '_')

    # A freshly generated name must not be listed yet.
    self.assertNotIn(missing_name, ta.get_frame_names(),
                     "%s should not exist in the list of frames" % missing_name)

    deleted = ta.drop_frames(missing_name)
    self.assertEqual(0, deleted,
                     "drop_frames() should not have deleted any frames")
def test_generic_drop_by_invalid_name(self):
    """Call the generic ta.drop() with an unused name and expect a count of 0."""
    missing_name = str(uuid.uuid1()).replace('-', '_')

    # A freshly generated name must not be listed yet.
    self.assertNotIn(missing_name, ta.get_frame_names(),
                     "%s should not exist in the list of frames" % missing_name)

    # drop() on a name that does not exist must not fail and must delete nothing.
    deleted = ta.drop(missing_name)
    self.assertEqual(
        0, deleted,
        "drop() with a non-existent item name should not have deleted items")
def test_create_kmeans_model_with_duplicte_frame_name(self):
    """Creating a KMeansModel under a name already held by a frame is expected to raise."""
    shared_name = str(uuid.uuid1()).replace('-', '_')

    # Occupy the name with a frame first.
    ta.Frame(name=shared_name)
    self.assertIn(shared_name, ta.get_frame_names(),
                  "%s should be in the list of frames" % shared_name)

    # Re-using the frame's name for a model must be rejected.
    with self.assertRaises(Exception):
        ta.KMeansModel(name=shared_name)
def test_create_kmeans_model_with_duplicte_frame_name(self):
    """Creating a KMeansModel under a name already held by a frame is expected to raise."""
    shared_name = str(uuid.uuid1()).replace('-', '_')

    # Occupy the name with a frame first.
    ta.Frame(name=shared_name)
    self.assertIn(shared_name, ta.get_frame_names(),
                  "%s should be in the list of frames" % shared_name)

    # Re-using the frame's name for a model must be rejected.
    with self.assertRaises(Exception):
        ta.KMeansModel(name=shared_name)
def test_frame_rename(self):
    """Rename a CSV-backed frame and expect the new name in the frame listing."""
    print("define csv file")
    column_schema = [('a', str),
                     ('b', ta.int32),
                     ('labels', ta.int32),
                     ('predictions', ta.int32)]
    source = ta.CsvFile("/datasets/classification-compute.csv",
                        schema=column_schema,
                        delimiter=',',
                        skip_header_lines=1)

    print("create frame")
    renamed = ta.Frame(source, name="test_frame_rename")

    target_name = "test_frame_new_name"
    self.assertNotIn(target_name, ta.get_frame_names(),
                     "test_frame_new_name should not exist in list of frames")

    print("renaming frame")
    renamed.name = target_name
    self.assertIn(target_name, ta.get_frame_names(),
                  "test_frame_new_name should exist in list of frames")
def test_duplicate_graph_rename(self):
    """Renaming a graph onto a name used by a graph, frame or model is expected to raise.

    After every rejected rename the entities involved must still be listed
    under their original names.
    """
    first_graph_name = str(uuid.uuid1()).replace('-', '_')
    second_graph_name = str(uuid.uuid1()).replace('-', '_')
    taken_model_name = str(uuid.uuid1()).replace('-', '_')
    taken_frame_name = str(uuid.uuid1()).replace('-', '_')

    # Create two graphs plus a model and a frame whose names act as collisions.
    first_graph = ta.Graph(name=first_graph_name)
    second_graph = ta.Graph(name=second_graph_name)
    ta.KMeansModel(name=taken_model_name)
    ta.Frame(name=taken_frame_name)

    # Both graphs must be listed on the server.
    for created in (first_graph_name, second_graph_name):
        self.assertIn(created, ta.get_graph_names(),
                      created + " should exist in list of graphs")

    # Graph -> name of another graph: must be rejected.
    with self.assertRaises(Exception):
        second_graph.name = first_graph_name
    for survivor in (first_graph_name, second_graph_name):
        self.assertIn(survivor, ta.get_graph_names(),
                      survivor + " should still exist in list of graphs")

    # Graph -> name of a frame: must be rejected; graph1 and the frame remain.
    with self.assertRaises(Exception):
        first_graph.name = taken_frame_name
    self.assertIn(first_graph_name, ta.get_graph_names(),
                  first_graph_name + " should still exist in the list of graphs")
    self.assertIn(taken_frame_name, ta.get_frame_names(),
                  taken_frame_name + " should still exist in the list of frames")

    # Graph -> name of a model: must be rejected; graph1 and the model remain.
    with self.assertRaises(Exception):
        first_graph.name = taken_model_name
    self.assertIn(first_graph_name, ta.get_graph_names(),
                  first_graph_name + " should still exist in the list of graphs")
    self.assertIn(taken_model_name, ta.get_model_names(),
                  taken_model_name + " should still exist in the list of models")
def test_gc_drop_stale_and_finalize(self):
    """Walk an unnamed and a named frame through drop_stale() and finalize_dropped().

    Each frame's error frame is checked alongside it; the test expects the
    error frame's status to follow the status of its parent frame.
    """
    dates_csv = ta.CsvFile("/datasets/dates.csv",
                           schema=[('start', ta.datetime),
                                   ('id', int),
                                   ('stop', ta.datetime),
                                   ('color', str)],
                           delimiter=',')
    kept_name = "dates_two"
    if kept_name in ta.get_frame_names():
        ta.drop_frames(kept_name)

    # Unnamed frame and its (also unnamed) error frame.
    unnamed_frame = ta.Frame(dates_csv)
    unnamed_errors = unnamed_frame.get_error_frame()
    self.assertIsNotNone(unnamed_errors)
    self.assertIsNone(unnamed_errors.name)

    # Named frame; its error frame is still expected to have no name.
    named_frame = ta.Frame(dates_csv, name=kept_name)
    named_errors = named_frame.get_error_frame()
    self.assertIsNotNone(named_errors)
    self.assertIsNone(named_errors.name)

    def expect_statuses(unnamed_status, named_status):
        # Checked in the order: unnamed frame, its errors, named frame, its errors.
        self.assertEqual(unnamed_status, unnamed_frame.status)
        self.assertEqual(unnamed_status, unnamed_errors.status)
        self.assertEqual(named_status, named_frame.status)
        self.assertEqual(named_status, named_errors.status)

    # A normal drop_stale changes nothing: these frames are not old enough.
    admin.drop_stale()
    expect_statuses("ACTIVE", "ACTIVE")

    # Nothing was dropped, so there is nothing to finalize.
    admin.finalize_dropped()
    expect_statuses("ACTIVE", "ACTIVE")

    # With a very small age only the unnamed frame should get dropped.
    admin.drop_stale("1ms")
    expect_statuses("DROPPED", "ACTIVE")

    # Only the dropped unnamed frame and its error frame should be finalized.
    admin.finalize_dropped()
    expect_statuses("FINALIZED", "ACTIVE")
def test_gc_drop_stale_and_finalize(self):
    """Walk an unnamed and a named frame through drop_stale() and finalize_dropped().

    Each frame's error frame is checked alongside it; the test expects the
    error frame's status to follow the status of its parent frame.
    """
    dates_csv = ta.CsvFile("/datasets/dates.csv",
                           schema=[('start', ta.datetime),
                                   ('id', int),
                                   ('stop', ta.datetime),
                                   ('color', str)],
                           delimiter=',')
    kept_name = "dates_two"
    if kept_name in ta.get_frame_names():
        ta.drop_frames(kept_name)

    # Unnamed frame and its (also unnamed) error frame.
    unnamed_frame = ta.Frame(dates_csv)
    unnamed_errors = unnamed_frame.get_error_frame()
    self.assertIsNotNone(unnamed_errors)
    self.assertIsNone(unnamed_errors.name)

    # Named frame; its error frame is still expected to have no name.
    named_frame = ta.Frame(dates_csv, name=kept_name)
    named_errors = named_frame.get_error_frame()
    self.assertIsNotNone(named_errors)
    self.assertIsNone(named_errors.name)

    def expect_statuses(unnamed_status, named_status):
        # Checked in the order: unnamed frame, its errors, named frame, its errors.
        self.assertEqual(unnamed_status, unnamed_frame.status)
        self.assertEqual(unnamed_status, unnamed_errors.status)
        self.assertEqual(named_status, named_frame.status)
        self.assertEqual(named_status, named_errors.status)

    # A normal drop_stale changes nothing: these frames are not old enough.
    admin.drop_stale()
    expect_statuses("ACTIVE", "ACTIVE")

    # Nothing was dropped, so there is nothing to finalize.
    admin.finalize_dropped()
    expect_statuses("ACTIVE", "ACTIVE")

    # With a very small age only the unnamed frame should get dropped.
    admin.drop_stale("1ms")
    expect_statuses("DROPPED", "ACTIVE")

    # Only the dropped unnamed frame and its error frame should be finalized.
    admin.finalize_dropped()
    expect_statuses("FINALIZED", "ACTIVE")
def get_frame(name):
    """Return the frame to work on, creating an empty one called *name* if needed.

    Relies on module-level state defined outside this function: ``mode``,
    the cached ``frame``, the ``tap`` client module and ``schema``.

    Resolution order:
      1. If ``mode`` is ``None`` or ``'local'``, print a warning and return None.
      2. If a frame is already cached in the ``frame`` global, return it
         (whatever *name* was passed).
      3. If the server lists a frame called *name*, fetch and return it.
      4. Otherwise upload an empty row set using ``schema``, name the new
         frame *name*, cache it in the ``frame`` global and return it.

    :param name: frame name to look up or to give to a newly created frame
    :return: a frame object, or None when not connected
    """
    global frame

    if mode is None or mode == 'local':
        print('Warning: Not connected to ATK')
        return None

    if frame is not None:
        return frame

    if name in tap.get_frame_names():
        # NOTE(review): a frame fetched here is not stored in the ``frame``
        # cache, unlike a newly created one -- confirm this is intended.
        return tap.get_frame(name)

    frame = tap.Frame(tap.UploadRows([], schema))
    frame.name = name
    return frame
def test_frame_drop(self):
    """Drop a frame by proxy object and then by name, checking the listing each time.

    After each drop the frame name must be absent from ta.get_frame_names();
    the drop-by-name call must additionally report exactly one deleted frame.
    """
    frame_name = "test_frame_drop"

    print("define csv file")
    csv = ta.CsvFile("/datasets/classification-compute.csv",
                     schema=[('a', str),
                             ('b', ta.int32),
                             ('labels', ta.int32),
                             ('predictions', ta.int32)],
                     delimiter=',',
                     skip_header_lines=1)

    print("create frame")
    frame = ta.Frame(csv, name=frame_name)

    print("dropping frame by entity")
    ta.drop_frames(frame)
    self.assertNotIn(frame_name, ta.get_frame_names(),
                     "test_frame_drop should not exist in list of frames")

    # Re-create the frame so the same name can be dropped again, by name.
    ta.Frame(csv, name=frame_name)

    print("dropping frame by name")
    self.assertEqual(1, ta.drop_frames(frame_name),
                     "drop_frames() should have deleted one frame")
    # Query a fresh listing here: reusing the one fetched before the frame
    # was re-created would make this assertion pass unconditionally.
    self.assertNotIn(frame_name, ta.get_frame_names(),
                     "test_frame_drop should not exist in list of frames")
def test_duplicate_model_rename(self):
    """Renaming a model onto a name used by a model, graph or frame is expected to raise.

    After every rejected rename the entities involved must still be listed
    under their original names.
    """
    first_model_name = str(uuid.uuid1()).replace('-', '_')
    second_model_name = str(uuid.uuid1()).replace('-', '_')
    taken_graph_name = str(uuid.uuid1()).replace('-', '_')
    taken_frame_name = str(uuid.uuid1()).replace('-', '_')

    # Create two models plus a graph and a frame whose names act as collisions.
    first_model = ta.KMeansModel(name=first_model_name)
    second_model = ta.KMeansModel(name=second_model_name)
    ta.Graph(name=taken_graph_name)
    ta.Frame(name=taken_frame_name)

    # Both models must be listed on the server.
    for created in (first_model_name, second_model_name):
        self.assertIn(created, ta.get_model_names(),
                      created + " should exist in list of models")

    # Model -> name of another model: must be rejected.
    with self.assertRaises(Exception):
        second_model.name = first_model_name
    for survivor in (first_model_name, second_model_name):
        self.assertIn(survivor, ta.get_model_names(),
                      survivor + " should still exist in list of models")

    # Model -> name of a graph: must be rejected; model1 and the graph remain.
    with self.assertRaises(Exception):
        first_model.name = taken_graph_name
    self.assertIn(first_model_name, ta.get_model_names(),
                  first_model_name + " should still exist in the list of models")
    self.assertIn(taken_graph_name, ta.get_graph_names(),
                  taken_graph_name + " should still exist in the list of graphs")

    # Model -> name of a frame: must be rejected; model1 and the frame remain.
    with self.assertRaises(Exception):
        first_model.name = taken_frame_name
    self.assertIn(first_model_name, ta.get_model_names(),
                  first_model_name + " should still exist in the list of models")
    self.assertIn(taken_frame_name, ta.get_frame_names(),
                  taken_frame_name + " should still exist in the list of frames")
def test_access_refreshes_frames(self):
    """Tests that some actions do or do not update the last_read_date entity property"""
    frame_name = "update_last_read"
    dates_csv = ta.CsvFile("/datasets/dates.csv",
                           schema=[('start', ta.datetime),
                                   ('id', int),
                                   ('stop', ta.datetime),
                                   ('color', str)],
                           delimiter=',')
    if frame_name in ta.get_frame_names():
        ta.drop_frames(frame_name)

    # The frame is named to save it from the other GC-related test in this suite.
    dated = ta.Frame(dates_csv, name=frame_name)

    # Reading the property twice in a row must give the same value.
    baseline = dated.last_read_date
    repeated = dated.last_read_date
    self.assertEqual(baseline, repeated)

    # Reading schema (a metadata property) should not move the last read date.
    _ = dated.schema
    after_schema = dated.last_read_date
    self.assertEqual(baseline, after_schema)

    # inspect() should move the last read date forward.
    dated.inspect()
    after_inspect = dated.last_read_date
    self.assertLess(after_schema, after_inspect)

    # copy() should move the last read date forward.
    dated.copy()
    after_copy = dated.last_read_date
    self.assertLess(after_inspect, after_copy)

    # bin_column() is expected to move the last read date forward as well.
    dated.bin_column('id', [3, 5, 8])
    after_bin = dated.last_read_date
    self.assertLess(after_copy, after_bin)
def test_drop_frame_that_does_not_exist(self):
    """Call drop_frames() with an unused name and expect it to report 0 deletions."""
    missing_name = str(uuid.uuid1()).replace('-', '_')

    # A freshly generated name must not be listed yet.
    self.assertNotIn(missing_name, ta.get_frame_names(),
                     "%s should not exist in the list of frames" % missing_name)

    deleted = ta.drop_frames(missing_name)
    self.assertEqual(0, deleted,
                     "drop_frames() should not have deleted any frames")
def testLinearRegression(self):
    """Train a RandomForestClassifierModel on an HDFS CSV and check predict()'s columns.

    NOTE(review): despite the method name, the model built here is a random
    forest classifier, not a linear regression model.

    The column names are listed once and reused for the CSV schema, the
    training feature list and the expected output columns.
    """
    label_column = 'GXY'
    int_feature_columns = ['Age', 'Sex']
    # Columns HPI, PDW, ECG, BCJC1, IRDS and WK are deliberately left out
    # of the schema (they were disabled in the original column list).
    float_feature_columns = [
        'Height', 'Weight', 'BMI', 'DBP', 'SBP', 'HCT', 'MCV', 'RDW_SD',
        'RDW_CV', 'HGB', 'MCH', 'MCHC', 'RBC', 'WBC', 'NEUT1', 'LYMPH',
        'MONO1', 'EO1', 'BASO1', 'NEUT2', 'MONO2', 'EO2', 'BASO2', 'PLT',
        'MPV', 'P_LCR', 'PCT', 'Lymph_3', 'ESR', 'PH', 'PRO', 'GIu', 'KET',
        'BLD', 'BIL', 'URO', 'NIT', 'SG', 'LEU', 'N_QT', 'VC', 'OB', 'FBG',
        'HBsAg', 'HBsAb', 'HBeAg', 'HBeAb', 'HBcAb', 'TBiL', 'ALT', 'AST',
        'AKP', 'GGT', 'ADA', 'TPO', 'Aib', 'Gib', 'A_G', 'PA', 'AST_ALT',
        'BUN', 'Cr', 'UA', 'CK', 'LDH', 'CK_MB', 'LDH_MB', 'a_HBD', 'TNI',
        'Fg', 'K1', 'AFP', 'CEA', 'Free_PSA', 'CA125', 'CA19_9', 'NSE',
        'CA242', 'B_HCG', 'CA15_3', 'CA50', 'CA72_4', 'HGH', 'SF', 'QJD',
        'DCJC', 'MJJC', 'RUT', 'PGI_PGII', 'Ca2', 'P3', 'K2', 'Na', 'CI',
    ]
    feature_columns = int_feature_columns + float_feature_columns
    expected_columns = [label_column] + feature_columns + ['predicted_class']

    print("define csv file")
    column_schema = [(label_column, ta.int32)]
    column_schema += [(column, ta.int32) for column in int_feature_columns]
    column_schema += [(column, ta.float64) for column in float_feature_columns]
    csv = ta.CsvFile(
        "hdfs://nameservice1/org/intel/hdfsbroker/userspace/9bb351fa-7b17-4a81-b3b0-521639c1d473/d342214b-c4c0-4963-aeaf-5adf054e22b6/000000_1",
        schema=column_schema,
        skip_header_lines=1)

    print("create frame")
    frame_name = 'Random_forest_SampleFrame'
    if frame_name in ta.get_frame_names():
        # Leftover from an earlier run: remove it so the name is free.
        print("Frame exists, delete it")
        ta.drop_frames(frame_name)
    frame = ta.Frame(csv, frame_name)

    print("Initializing a RandomForestModel object")
    model_name = 'POCRandom_forest_SampleModel'
    if model_name in ta.get_model_names():
        # Leftover from an earlier run: remove it so the name is free.
        print("Model exist, delete")
        ta.drop_models(model_name)
    classifier = ta.RandomForestClassifierModel(name=model_name)

    print("Training the model on the Frame")
    classifier.train(frame, label_column, feature_columns, num_classes=2)

    print("Predicting on the Frame")
    output = classifier.predict(frame)
    # predict() is expected to return every input column plus 'predicted_class'.
    self.assertEqual(output.column_names, expected_columns)
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import trustedanalytics as ta

ta.connect()

# Remove every frame currently listed on the server before loading data.
for name in ta.get_frame_names():
    print('deleting frame: %s' % name)
    ta.drop_frames(name)

# Load employees.csv (one header line skipped) into a frame named 'employees_frame'.
employees_schema = [('Employee', str),
                    ('Manager', str),
                    ('Title', str),
                    ('Years', ta.int64)]
employees_csv = ta.CsvFile("employees.csv",
                           schema=employees_schema,
                           skip_header_lines=1)
employees_frame = ta.Frame(employees_csv, 'employees_frame')

employees_frame.inspect()

# A bipartite graph
# Notice that this is a funny example since managers are also employees!
# Presumably Steve the manager and Steve the employee are the same person
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import trustedanalytics as ta

ta.connect()

# Remove every frame currently listed on the server before loading data.
for name in ta.get_frame_names():
    print('deleting frame: %s' % name)
    ta.drop_frames(name)

# Load employees.csv (one header line skipped) into a frame named 'employees_frame'.
employees_schema = [('Employee', str),
                    ('Manager', str),
                    ('Title', str),
                    ('Years', ta.int64)]
employees_csv = ta.CsvFile("employees.csv",
                           schema=employees_schema,
                           skip_header_lines=1)
employees_frame = ta.Frame(employees_csv, 'employees_frame')

employees_frame.inspect()

# A bipartite graph
# Notice that this is a funny example since managers are also employees!
# Presumably Steve the manager and Steve the employee are the same person

# Option 1
graph = ta.Graph()
def test_generic_drop_by_invalid_name(self):
    """Call the generic ta.drop() with an unused name and expect a count of 0."""
    missing_name = str(uuid.uuid1()).replace('-', '_')

    # A freshly generated name must not be listed yet.
    self.assertNotIn(missing_name, ta.get_frame_names(),
                     "%s should not exist in the list of frames" % missing_name)

    # drop() on a name that does not exist must not fail and must delete nothing.
    deleted = ta.drop(missing_name)
    self.assertEqual(
        0, deleted,
        "drop() with a non-existent item name should not have deleted items")
def test_naive_bayes(self):
    """Train a NaiveBayesModel on an HDFS CSV and check the columns predict() returns.

    The column names are listed once and reused for the CSV schema, the
    training feature list and the expected output columns. Any frame or
    model left over from an earlier run under the same name is dropped first.
    """
    label_column = 'GXY'
    int_feature_columns = ['Age', 'Sex']
    # Columns HPI, PDW, ECG, BCJC1, IRDS and WK are deliberately left out
    # of the schema (they were disabled in the original column list).
    float_feature_columns = [
        'Height', 'Weight', 'BMI', 'DBP', 'SBP', 'HCT', 'MCV', 'RDW_SD',
        'RDW_CV', 'HGB', 'MCH', 'MCHC', 'RBC', 'WBC', 'NEUT1', 'LYMPH',
        'MONO1', 'EO1', 'BASO1', 'NEUT2', 'MONO2', 'EO2', 'BASO2', 'PLT',
        'MPV', 'P_LCR', 'PCT', 'Lymph_3', 'ESR', 'PH', 'PRO', 'GIu', 'KET',
        'BLD', 'BIL', 'URO', 'NIT', 'SG', 'LEU', 'N_QT', 'VC', 'OB', 'FBG',
        'HBsAg', 'HBsAb', 'HBeAg', 'HBeAb', 'HBcAb', 'TBiL', 'ALT', 'AST',
        'AKP', 'GGT', 'ADA', 'TPO', 'Aib', 'Gib', 'A_G', 'PA', 'AST_ALT',
        'BUN', 'Cr', 'UA', 'CK', 'LDH', 'CK_MB', 'LDH_MB', 'a_HBD', 'TNI',
        'Fg', 'K1', 'AFP', 'CEA', 'Free_PSA', 'CA125', 'CA19_9', 'NSE',
        'CA242', 'B_HCG', 'CA15_3', 'CA50', 'CA72_4', 'HGH', 'SF', 'QJD',
        'DCJC', 'MJJC', 'RUT', 'PGI_PGII', 'Ca2', 'P3', 'K2', 'Na', 'CI',
    ]
    feature_columns = int_feature_columns + float_feature_columns
    expected_columns = [label_column] + feature_columns + ['predicted_class']

    print("define csv file")
    column_schema = [(label_column, ta.int32)]
    column_schema += [(column, ta.int32) for column in int_feature_columns]
    column_schema += [(column, ta.float64) for column in float_feature_columns]
    csv = ta.CsvFile(
        "hdfs://nameservice1/org/intel/hdfsbroker/userspace/ae6a38d3-191f-494f-86a6-3fe1b2255902/e3327582-f475-4dc9-8efa-96070abb606d/000000_1",
        schema=column_schema,
        skip_header_lines=1)

    print("create frame")
    frame_name = 'ModelNaiveBayesFrame'
    if frame_name in ta.get_frame_names():
        # Leftover from an earlier run: remove it so the name is free.
        print("Frame exists, delete it")
        ta.drop_frames(frame_name)
    train_frame = ta.Frame(csv, frame_name)

    # The progress message previously named the wrong model type.
    print("Initializing a NaiveBayesModel object")
    model_name = 'POCModelNaiveBayesModel'
    if model_name in ta.get_model_names():
        # Leftover from an earlier run: remove it so the name is free.
        print("Model exist, delete")
        ta.drop_models(model_name)
    naive = ta.NaiveBayesModel(name=model_name)

    print("Training the model on the Frame")
    # NOTE(review): confirm that NaiveBayesModel.train() accepts num_classes;
    # the argument is kept exactly as the test passed it before.
    naive.train(train_frame, label_column, feature_columns, num_classes=2)

    print("Predicting on the Frame")
    output = naive.predict(train_frame)
    # predict() is expected to return every input column plus 'predicted_class'.
    self.assertEqual(output.column_names, expected_columns)
("DBP",ta.float64), ("Cr",ta.int32), ("HCT",ta.float64) ], skip_header_lines=1); # In[27]: #create frame #frame_name = "myframe"; #if frame_name in ta.get_frame_names(): #ta.drop_frames(frame_name) frame_name = 'myframe' exist_frames = ta.get_frame_names() if frame_name in exist_frames: print "Frame exists, delete it" ta.drop_frames(frame_name) my_frame = ta.Frame(csv, frame_name) my_frame.inspect(21) # In[31]: #feature classify def transformation_DBP(row): #<60一组,60~90每10mmHg一组,≥90一组 dbp = row.DBP