def test_query_multiple_partitions(self):
    # Each case widens one or more partition-key lists and states how many
    # rows the query is then expected to return.
    window_end = ts[5] + timedelta(seconds=0.5)
    cases = [
        # (expected row count, companies, testers, test names)
        (2, ['Company-1'], ['Station-1'], ['Test-1']),
        (3, ['Company-1', 'Company-2'], ['Station-1'], ['Test-1']),
        (3, ['Company-1'], ['Station-1', 'Station-3'], ['Test-1']),
        (4, ['Company-1', 'Company-2'], ['Station-1', 'Station-3'],
         ['Test-1']),
        (5, ['Company-1', 'Company-2'], ['Station-1', 'Station-3'],
         ['Test-1', 'Test-3']),
    ]
    for expected_rows, companies, testers, test_names in cases:
        partitions = {
            'company': companies,
            'site': ['Site-1'],
            'device_group': ['1000'],
            'tester': testers,
            'test_name': test_names,
        }
        result = ec.query_measurements_original(
            partitions, ts[0], window_end)
        self.assertEqual(expected_rows, result.count())
def test_query_array(self):
    # Query the single Test-5 row and check it field by field; meas_value
    # is compared element-wise because it holds an array.
    partition = {
        'company': 'Company-1',
        'site': 'Site-1',
        'device_group': '1000',
        'tester': 'Station-1',
        'test_name': 'Test-5',
    }
    result = ec.query_measurements_original(
        partition, ts[0], ts[5] + timedelta(seconds=0.5))
    self.assertEqual(1, result.count())

    row = result.head()
    leading_fields = (
        ('company', 'Company-1'),
        ('site', 'Site-1'),
        ('device_group', '1000'),
        ('tester', 'Station-1'),
        ('ts', ts[4]),
        ('device_name', '101001'),
        ('test_name', 'Test-5'),
        ('meas_name', 'Meas-2'),
    )
    for field, wanted in leading_fields:
        self.assertEqual(wanted, getattr(row, field))

    wanted_values = numpy.array([0.1111, 0.2222, 0.3333, 0.4444])
    self.assertTrue(numpy.array_equal(wanted_values, row.meas_value))

    self.assertEqual('V', row.meas_unit)
    self.assertEqual('PASS', row.meas_status)
    # No limits are expected on this measurement.
    self.assertIsNone(row.meas_lower_limit)
    self.assertIsNone(row.meas_upper_limit)

    trailing_fields = (
        ('meas_description', 'Description'),
        ('device_status', 'PASS'),
        ('test_status', 'PASS'),
    )
    for field, wanted in trailing_fields:
        self.assertEqual(wanted, getattr(row, field))
def test_query_double(self):
    # A floating-point sensor reading queried over the half second after
    # ts[0] should come back as exactly one row equal to the expected
    # SensorMeasurement.
    partition = {
        'company': 'Company-1',
        'site': 'Site-1',
        'station': 'Station-1',
        'sensor': 'Sensor-1',
        'meas_name': 'Meas-1',
    }
    window_start = ts[0]
    window_end = ts[0] + timedelta(seconds=0.5)
    result = ec.query_measurements_original(
        partition, window_start, window_end)
    self.assertEqual(1, result.count())

    expected_row = SensorMeasurement(
        'Company-1', 'Site-1', 'Station-1', 'Sensor-1', ts[0], 'Event-1',
        'Meas-1', None, 45.7, 'degree C', 'PASS', 40.0, 90.0,
        'Description')
    self.assertEqualRows(expected_row, result.head())
def test_query_binary(self):
    # Query the single Meas-5 sensor row and check it field by field;
    # meas_value is compared element-wise because it holds an array.
    partition = {
        'company': 'Company-1',
        'site': 'Site-1',
        'station': 'Station-1',
        'sensor': 'Sensor-1',
        'meas_name': 'Meas-5',
    }
    result = ec.query_measurements_original(
        partition, ts[0], ts[4] + timedelta(seconds=0.5))
    self.assertEqual(1, result.count())

    row = result.head()
    leading_fields = (
        ('company', 'Company-1'),
        ('site', 'Site-1'),
        ('station', 'Station-1'),
        ('sensor', 'Sensor-1'),
        ('ts', ts[4]),
        ('event', 'Event-1'),
        ('meas_name', 'Meas-5'),
    )
    for field, wanted in leading_fields:
        self.assertEqual(wanted, getattr(row, field))

    wanted_values = numpy.array([0.5555, 0.6666, 0.7777, 0.8888, 0.9999])
    self.assertTrue(numpy.array_equal(wanted_values, row.meas_value))

    self.assertEqual('V', row.meas_unit)
    self.assertEqual('PASS', row.meas_status)
    # No limits are expected on this measurement.
    self.assertIsNone(row.meas_lower_limit)
    self.assertIsNone(row.meas_upper_limit)
    self.assertEqual('Description', row.meas_description)
def test_query_string(self):
    # A string-valued sensor reading ('POWER ON') should come back as
    # exactly one row equal to the expected SensorMeasurement.
    partition = {
        'company': 'Company-1',
        'site': 'Site-1',
        'station': 'Station-1',
        'sensor': 'Sensor-1',
        'meas_name': 'Meas-4',
    }
    window_end = ts[4] + timedelta(seconds=0.5)
    result = ec.query_measurements_original(partition, ts[0], window_end)
    self.assertEqual(1, result.count())

    expected_row = SensorMeasurement(
        'Company-1', 'Site-1', 'Station-1', 'Sensor-1', ts[3], 'Event-1',
        'Meas-4', None, 'POWER ON', None, 'PASS', None, None,
        'Description')
    self.assertEqualRows(expected_row, result.head())
def test_quartile_empty(self):
    # Quartile outlier detection on an empty DataFrame must flag no rows.
    partition = {
        'company': 'Company-1',
        'site': 'Site-1',
        'device_group': '1000',
        # No data exists for Station-NONE, so the query result is empty.
        'tester': 'Station-NONE',
    }
    window_end = ts[14] + timedelta(seconds=0.5)
    frame = ec.query_measurements_original(
        partition, ts[0], window_end).toPandas()
    flagged = outliers(frame, 'meas_value', 'quartile')
    self.assertEqual(0, flagged.shape[0])
def test_query_double(self):
    # A floating-point measurement queried over the half second after
    # ts[0] should come back as exactly one row equal to the expected
    # AutomatedTest.
    partition = {
        'company': 'Company-1',
        'site': 'Site-1',
        'device_group': '1000',
        'tester': 'Station-1',
        'test_name': 'Test-1',
    }
    window_start = ts[0]
    window_end = ts[0] + timedelta(seconds=0.5)
    result = ec.query_measurements_original(
        partition, window_start, window_end)
    self.assertEqual(1, result.count())

    expected_row = AutomatedTest(
        'Company-1', 'Site-1', '1000', 'Station-1', ts[0], '100001',
        'Test-1', 'Meas-1', None, 45.7, 'degree C', 'PASS', 40.0, 90.0,
        'Description', 'PASS', 'PASS')
    self.assertEqualRows(expected_row, result.head())
def test_query_string(self):
    # A string-valued measurement ('POWER ON') should come back as exactly
    # one row equal to the expected AutomatedTest.
    partition = {
        'company': 'Company-1',
        'site': 'Site-1',
        'device_group': '1000',
        'tester': 'Station-1',
        'test_name': 'Test-4',
    }
    window_end = ts[5] + timedelta(seconds=0.5)
    result = ec.query_measurements_original(partition, ts[0], window_end)
    self.assertEqual(1, result.count())

    expected_row = AutomatedTest(
        'Company-1', 'Site-1', '1000', 'Station-1', ts[3], '101001',
        'Test-4', 'Meas-2', None, 'POWER ON', None, 'PASS', None, None,
        'Description', 'PASS', 'PASS')
    self.assertEqualRows(expected_row, result.head())
def test_query_int(self):
    # An integer measurement larger than 2**31 - 1, with limits one below
    # and one above it, should come back as exactly one row equal to the
    # expected AutomatedTest.
    partition = {
        'company': 'Company-1',
        'site': 'Site-1',
        'device_group': '1000',
        'tester': 'Station-1',
        'test_name': 'Test-3',
    }
    window_end = ts[5] + timedelta(seconds=0.5)
    result = ec.query_measurements_original(partition, ts[0], window_end)
    self.assertEqual(1, result.count())

    large = 3448388841
    lower_limit = large - 1
    upper_limit = large + 1
    expected_row = AutomatedTest(
        'Company-1', 'Site-1', '1000', 'Station-1', ts[2], '101001',
        'Test-3', 'Meas-2', None, large, 'ns', 'PASS', lower_limit,
        upper_limit, 'Description', 'PASS', 'PASS')
    self.assertEqualRows(expected_row, result.head())
def test_query_waveform(self):
    # A Waveform-valued measurement should come back as exactly one row
    # equal to the expected AutomatedTest.
    partition = {
        'company': 'Company-1',
        'site': 'Site-1',
        'device_group': '1000',
        'tester': 'Station-1',
        'test_name': 'Test-6',
    }
    window_end = ts[5] + timedelta(seconds=0.5)
    result = ec.query_measurements_original(partition, ts[0], window_end)
    self.assertEqual(1, result.count())

    expected_waveform = Waveform(
        ts[5], 0.1234, numpy.array([0.5678, 0.9012, 0.3456]))
    expected_row = AutomatedTest(
        'Company-1', 'Site-1', '1000', 'Station-1', ts[5], '101001',
        'Test-6', 'Meas-2', None, expected_waveform, 'V', 'PASS', None,
        None, 'Description', 'PASS', 'PASS')
    self.assertEqualRows(expected_row, result.head())
def test_insufficient_meas_dist(self):
    partition = {
        'company': 'Company-1',
        'site': 'Site-1',
        'device_group': '1000',
        'tester': 'Station-1',
    }
    # Ending the window just after ts[3] leaves out the second Meas-2
    # measurement.
    window_end = ts[3] + timedelta(seconds=0.5)
    df = ec.query_measurements_original(partition, ts[0], window_end)

    # With the second Meas-2 measurement omitted there are not enough
    # measurements to perform IMR across every measurement name.
    with self.assertRaises(Py4JJavaError):
        IMR(df).retrieve()

    # Restricting IMR to the first measurement only is fine.
    IMR(df, 'Meas-1').retrieve()
def test_selected_meas_dist(self):
    partition = {
        'company': 'Company-1',
        'site': 'Site-1',
        'device_group': '1000',
        'tester': 'Station-1',
    }
    df = ec.query_measurements_original(
        partition, ts[0], ts[4] + timedelta(seconds=0.5))

    # Filtering by meas_name before IMR is equivalent to passing the name
    # list to IMR.
    prefiltered = IMR(df.filter(df.meas_name == 'Meas-1'))
    selected_by_list = IMR(df, ['Meas-1'])
    self.assertEqualDataFrames(prefiltered, selected_by_list)

    # Selecting every meas_name is the same as not selecting at all.
    unfiltered = IMR(df)
    all_selected = IMR(df, ['Meas-1', 'Meas-2'])
    self.assertEqualDataFrames(unfiltered, all_selected)

    # A single name may also be given as a plain string.
    prefiltered_again = IMR(df.filter(df.meas_name == 'Meas-1'))
    selected_by_name = IMR(df, 'Meas-1')
    self.assertEqualDataFrames(prefiltered_again, selected_by_name)
def test_all_meas_dist(self):
    # Run IMR over every measurement name in the query result and compare
    # the rendered pandas table against a fixed expected string.
    df = ec.query_measurements_original({'company': 'Company-1',
                                         'site': 'Site-1',
                                         'device_group': '1000',
                                         'tester': 'Station-1'},
                                        ts[0],
                                        ts[4] + timedelta(seconds=0.5))
    # Compare without the 'ts' column due to time representation
    # inconsistencies between systems.
    # `axis` is passed by keyword: pandas 2.0 no longer accepts it
    # positionally in DataFrame.drop, while the keyword form works on old
    # and new releases alike.
    imr = IMR(df).toPandas().drop('ts', axis=1)
    # NOTE(review): to_string() normally pads columns with several spaces;
    # confirm the spacing inside these literals against real output.
    self.assertEqual(
        ' company site device_group tester device_name test_name meas_name meas_datatype meas_value meas_unit meas_status meas_lower_limit meas_upper_limit meas_description device_status test_status I I_mean I_LCL I_UCL MR MR_mean MR_LCL MR_UCL\n'
        '0 Company-1 Site-1 1000 Station-1 100001 Test-1 Meas-1 None 45.7 degree C PASS 40.0 90.0 Description PASS PASS 45.7 47.866667 -79.458667 175.192 NaN 1.85 0.0 7.89395\n'
        '1 Company-1 Site-1 1000 Station-1 101001 Test-1 Meas-1 None 49.1 degree C PASS 40.0 90.0 Description PASS PASS 49.1 47.866667 -79.458667 175.192 3.4 1.85 0.0 7.89395\n'
        '2 Company-1 Site-1 1000 Station-1 101001 Test-1 Meas-1 None 48.8 degree C PASS 40.0 90.0 Description PASS PASS 48.8 47.866667 -79.458667 175.192 0.3 1.85 0.0 7.89395\n'
        '3 Company-1 Site-1 1000 Station-1 101001 Test-1 Meas-2 None 88.8 degree C PASS 40.0 90.0 Description PASS PASS 88.8 83.200000 -138.112000 304.512 NaN 11.20 0.0 47.79040\n'
        '4 Company-1 Site-1 1000 Station-1 101001 Test-1 Meas-2 None 77.6 degree C PASS 40.0 90.0 Description PASS PASS 77.6 83.200000 -138.112000 304.512 11.2 11.20 0.0 47.79040',
        imr.to_string())
def test_single_meas_dist(self):
    # Check every IMR output column for the three Meas-1 readings against
    # values computed by hand below.
    df = ec.query_measurements_original({'company': 'Company-1',
                                         'site': 'Site-1',
                                         'device_group': '1000',
                                         'tester': 'Station-1',
                                         'meas_name': 'Meas-1'},
                                        ts[0], ts[4])
    # Convert to pandas once: every assertion below reads a column of the
    # same result, so there is no need to re-run the conversion each time.
    chart = IMR(df).toPandas()

    def column(name):
        # Return one column of the IMR result as a plain Python list.
        return chart.loc[:, name].values.tolist()

    readings = [45.7, 49.1, 48.8]
    self.assertEqual(readings, column('meas_value'))
    self.assertEqual(readings, column('I'))

    # NOTE(review): the I limits asserted here are I_mean -/+ 2.66 * I_mean.
    # Individuals charts conventionally use 2.66 * the mean moving range;
    # confirm this is what the IMR implementation intends.
    i_mean = (45.7 + 49.1 + 48.8) / 3.0
    self.assertEqual([i_mean] * 3, column('I_mean'))
    i_lcl = i_mean - 2.66 * i_mean
    self.assertEqual([i_lcl] * 3, column('I_LCL'))
    i_ucl = i_mean + 2.66 * i_mean
    self.assertEqual([i_ucl] * 3, column('I_UCL'))

    # The first row's MR is skipped by the [1:] slice.
    self.assertEqual([49.1 - 45.7, 49.1 - 48.8], column('MR')[1:])
    mr_mean = (49.1 - 45.7 + 49.1 - 48.8) / 2.0
    self.assertEqual([mr_mean] * 3, column('MR_mean'))
    mr_lcl = 0.0
    self.assertEqual([mr_lcl] * 3, column('MR_LCL'))
    mr_ucl = mr_mean + 3.267 * mr_mean
    self.assertEqual([mr_ucl] * 3, column('MR_UCL'))
def test_quartile(self):
    # The quartile method should return the expected rows, each tagged with
    # a meas_flag and a meas_method column.
    df = ec.query_measurements_original(
        {
            'company': 'Company-1',
            'site': 'Site-1',
            'device_group': '1000',
            'tester': 'Station-2'},
        ts[0],
        ts[14] + timedelta(seconds=0.5)).toPandas()
    # DataFrame.ix was removed in pandas 1.0. On an integer index it sliced
    # by label, which is exactly what .loc does.
    # NOTE(review): assumes toPandas() yields the default integer index --
    # confirm. Copy so the added columns never touch `df`, which is passed
    # to outliers() below.
    expected = df.loc[10:14, :].copy()
    expected.loc[:, 'meas_flag'] = ['mild', 'extreme', 'mild', 'extreme']
    expected.loc[:, 'meas_method'] = ['quartile']
    self.assertEqual(expected.to_string(),
                     outliers(df, 'meas_value', 'quartile').to_string())
class DataFrameTests(Base):
    """Exercise the DataFrame returned by ``ec.query_measurements_original``.

    All tests share the class-level ``df``, which is queried once when the
    class body is executed.
    """

    # NOTE(review): fromtimestamp() converts using the local time zone --
    # confirm the stored fixture timestamps use the same convention.
    start_time = datetime.fromtimestamp(1428004316.123)
    end_time = datetime.fromtimestamp(1428005326.163)
    df = ec.query_measurements_original(
        {
            'company': 'Company-1',
            'site': 'Site-1',
            'device_group': '1000',
            'tester': 'Station-1'
        }, start_time, end_time)

    def test_cache(self):
        # Not testing caching, just the return value.
        self.assertEqualDataFrames(self.df, self.df.cache())

    def test_count(self):
        self.assertEqual(6, self.df.count())

    def test_describe(self):
        # NOTE(review): to_string() normally pads columns with several
        # spaces; confirm the spacing inside the expected literals used
        # throughout this class against real output.
        self.assertEqual(
            ' meas_value meas_lower_limit meas_upper_limit\n'
            'count 6.0 3.000000e+00 3.000000e+00\n'
            'mean NaN 1.149463e+09 1.149463e+09\n'
            'std NaN 1.990928e+09 1.990928e+09\n'
            'min NaN 4.000000e+01 9.000000e+01\n'
            'max NaN 3.448389e+09 3.448389e+09',
            self.df.describe().to_string())
        self.assertEqual(
            ' meas_value meas_lower_limit meas_upper_limit\n'
            'count 2.000000 2.0 2.0\n'
            'mean 47.400000 40.0 90.0\n'
            'std 2.404163 0.0 0.0\n'
            'min 45.700000 40.0 90.0\n'
            'max 49.100000 40.0 90.0',
            self.df.filter(
                self.df.test_name == 'Test-1').describe().to_string())

    def test_describe_empty(self):
        empty_df = self.df.filter(self.df.meas_name == 'MISSING_NAME')
        self.assertEqual(
            ' meas_value meas_lower_limit meas_upper_limit\n'
            'count 0.0 0.0 0.0\n'
            'mean NaN NaN NaN\n'
            'std NaN NaN NaN\n'
            'min NaN NaN NaN\n'
            'max NaN NaN NaN',
            empty_df.describe().to_string())

    def test_describe_by_group(self):
        self.assertEqual(
            ' Test-1.meas_value Test-1.meas_lower_limit Test-1.meas_upper_limit Test-3.meas_value Test-3.meas_lower_limit Test-3.meas_upper_limit Test-4.meas_value Test-4.meas_lower_limit Test-4.meas_upper_limit Test-5.meas_value Test-5.meas_lower_limit Test-5.meas_upper_limit Test-6.meas_value Test-6.meas_lower_limit Test-6.meas_upper_limit\n'
            'count 2.000000 2.0 2.0 1 1 1 1.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0\n'
            'mean 47.400000 40.0 90.0 3.44839e+09 3.44839e+09 3.44839e+09 NaN NaN NaN NaN NaN NaN NaN NaN NaN\n'
            'std 2.404163 0.0 0.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN\n'
            'min 45.700000 40.0 90.0 3448388841 3448388840 3448388842 NaN NaN NaN NaN NaN NaN NaN NaN NaN\n'
            'max 49.100000 40.0 90.0 3448388841 3448388840 3448388842 NaN NaN NaN NaN NaN NaN NaN NaN NaN',
            self.df.describe_by_group(
                'test_name',
                ['meas_value', 'meas_lower_limit', 'meas_upper_limit'
                 ]).to_string())

    def test_describe_by_group_empty(self):
        empty_df = self.df.filter(self.df.meas_name == 'MISSING_NAME')
        self.assertEqual(
            'Empty DataFrame\nColumns: []\nIndex: [count, mean, std, min, max]',
            empty_df.describe_by_group(
                'test_name',
                ['meas_value', 'meas_lower_limit', 'meas_upper_limit'
                 ]).to_string())

    def test_distinct(self):
        # .iloc replaces DataFrame.ix (removed in pandas 1.0); the lookup
        # here is positional: first column of the result.
        self.assertEqual(
            {'Test-1', 'Test-3', 'Test-4', 'Test-5', 'Test-6'},
            set(
                self.df.select(
                    'test_name').distinct().toPandas().iloc[:, 0].tolist()))

    def test_filter(self):
        self.assertEqual(
            2, self.df.filter(self.df.test_name == 'Test-1').count())
        # Column-expression filter(), boolean indexing and a string
        # condition must all select the same rows.
        self.assertEqualDataFrames(
            self.df.filter(self.df.test_name == 'Test-1'),
            self.df[self.df.test_name == 'Test-1'])
        self.assertEqualDataFrames(
            self.df.filter(self.df.test_name == 'Test-1'),
            self.df.filter("test_name = 'Test-1'"))
        self.assertEqual(
            1, self.df.filter(self.df.test_name == 'Test-5').count())
        self.assertEqual(
            5, self.df.filter(self.df.test_name != 'Test-5').count())

    def test_get_device_data(self):
        # sort_values() replaces pandas' DataFrame.sort(), which was
        # removed in pandas 0.20.
        self.assertEqual(
            self.df.filter(
                self.df.device_name == '101001').toPandas().sort_values(
                    'ts').to_string(),
            self.df.get_device_data('101001').to_string())

    def test_get_device_data_without_ts(self):
        # Test without a 'ts' timestamp field to sort by.
        df_device = self.df.select('device_name')
        self.assertEqual(
            df_device.filter(
                df_device.device_name == '101001').toPandas().to_string(),
            df_device.get_device_data('101001').to_string())

    def test_get_device_data_missing(self):
        # Test calling get_device_data on a DataFrame missing a device_name
        # field.
        df = self.df.select('meas_value')
        with self.assertRaises(KeyError):
            df.get_device_data('101001')

    def test_get_meas_data(self):
        # sort_values() replaces pandas' DataFrame.sort(), which was
        # removed in pandas 0.20.
        self.assertEqual(
            self.df.filter(
                self.df.meas_name == 'Meas-2').toPandas().sort_values(
                    'ts').to_string(),
            self.df.get_meas_data('Meas-2').to_string())

    def test_get_meas_data_without_ts(self):
        # Test without a 'ts' timestamp field to sort by.
        df_meas = self.df.select('meas_name')
        self.assertEqual(
            df_meas.filter(
                df_meas.meas_name == 'Meas-2').toPandas().to_string(),
            df_meas.get_meas_data('Meas-2').to_string())

    def test_get_meas_data_missing(self):
        # Test calling get_meas_data on a DataFrame missing a meas_name
        # field.
        df = self.df.select('meas_value')
        with self.assertRaises(KeyError):
            df.get_meas_data('Meas-2')

    def test_head(self):
        # head() with no argument yields a single row; head(n) yields a
        # sequence of rows.
        self.assertEqualRows(
            AutomatedTest('Company-1', 'Site-1', '1000', 'Station-1', ts[0],
                          '100001', 'Test-1', 'Meas-1', None, 45.7,
                          'degree C', 'PASS', 40.0, 90.0, 'Description',
                          'PASS', 'PASS'),
            self.df.head())
        self.assertEqualRows(
            AutomatedTest('Company-1', 'Site-1', '1000', 'Station-1', ts[1],
                          '101001', 'Test-1', 'Meas-2', None, 49.1,
                          'degree C', 'PASS', 40.0, 90.0, 'Description',
                          'PASS', 'PASS'),
            self.df.head(2)[1])
        self.assertEqual(3, len(self.df.head(3)))

    def test_limit(self):
        self.assertEqualRows(self.df.head(), self.df.limit(1).retrieve()[0])
        self.assertEqual(3, self.df.limit(3).count())
        for i, j in zip(self.df.head(3), self.df.limit(3).retrieve()):
            self.assertEqualRows(i, j)

    def test_select(self):
        # select() and list-style indexing must project the same columns.
        self.assertEqualRows(Row(meas_description='Description'),
                             self.df.select('meas_description').head())
        self.assertEqualRows(Row(meas_description='Description'),
                             self.df[['meas_description']].head())
        self.assertEqualRows(
            Row(meas_description='Description', meas_status='PASS'),
            self.df.select('meas_description', 'meas_status').head())
        self.assertEqualRows(
            Row(meas_description='Description', meas_status='PASS'),
            self.df[['meas_description', 'meas_status']].head())

    def test_sort(self):
        # This is the query DataFrame's own sort(), not the pandas method.
        self.assertEqual(ts[5],
                         self.df.sort(self.df.ts.desc()).head().ts)

    def test_show(self):
        # Smoke test: only checks that show() does not raise.
        self.df.show()

    def test_toPandas(self):
        # Convert once and reuse the frame for both the length check and
        # the per-row value checks.
        frame = self.df.toPandas()
        self.assertEqual(6, len(frame.index))
        values = frame['meas_value'].tolist()
        self.assertEqual(45.7, values[0])
        self.assertEqual(49.1, values[1])
        self.assertEqual(3448388841, values[2])
        self.assertEqual('POWER ON', values[3])
        self.assertTrue(
            numpy.array_equal(numpy.array([0.1111, 0.2222, 0.3333, 0.4444]),
                              values[4]))
        self.assertEqual(
            Waveform(ts[5], 0.1234, numpy.array([0.5678, 0.9012, 0.3456])),
            values[5])

    def test_union(self):
        # The union is checked in both operand orders.
        test1 = self.df.filter(self.df.test_name == 'Test-1')
        test5 = self.df.filter(self.df.test_name == 'Test-5')
        union = test1.union(test5)
        union2 = test5.union(test1)
        self.assertEqual(3, union.count())
        self.assertEqual(3, union2.count())
        # .iloc replaces DataFrame.ix (removed in pandas 1.0); the lookup
        # here is positional: first column of the result.
        self.assertEqual(
            {'Test-1', 'Test-5'},
            set(union[['test_name']].distinct().toPandas().iloc[
                :, 0].tolist()))
        self.assertEqual(
            {'Test-1', 'Test-5'},
            set(union2[['test_name']].distinct().toPandas().iloc[
                :, 0].tolist()))