def test_read_file_fastparquet_issue_375(self):
    """Uploading the issue-375 parquet fixture and reading it gives an empty table."""
    object_key = 'fastparquet-issue-375-snappy.par'
    tests_dir = os.path.dirname(__file__)
    fixture_path = os.path.join(tests_dir, '..', 'test_data', object_key)

    # Put the fixture where the StoredObject below will look for it.
    minio_client.fput_object(StoredObjectsBucket, object_key, fixture_path)

    stored = StoredObject(size=10, bucket=StoredObjectsBucket, key=object_key)
    expected = pd.DataFrame()
    assert_frame_equal(stored.get_table(), expected)
def test_create_table_if_different(self):
    """create_table_if_different() returns None for equal data, a new object otherwise."""
    original = StoredObject.create_table(self.wfm1, mock_csv_table)

    # Same table as the existing object: nothing should be created.
    unchanged = StoredObject.create_table_if_different(
        self.wfm1, original, mock_csv_table
    )
    self.assertIsNone(unchanged)

    # Different table: a new object holding the new data is expected.
    changed = StoredObject.create_table_if_different(
        self.wfm1, original, mock_csv_table2
    )
    self.assertIsNotNone(changed)
    reloaded = changed.get_table()
    self.assertTrue(reloaded.equals(mock_csv_table2))
def test_wf_module_data_versions(self):
    """Walk through storing, selecting, and listing fetched-data versions."""
    first_table = mock_csv_table
    second_table = mock_csv_table2

    # Before anything is stored there is no table to retrieve.
    initial = self.wfmodule1.retrieve_fetched_table()
    self.assertIsNone(initial)

    # Store a first version, then round-trip the module through the database.
    version_a = self.wfmodule1.store_fetched_table(first_table)
    self.wfmodule1.save()
    self.wfmodule1.refresh_from_db()

    # Storing must not switch the selected version by itself ...
    selected = self.wfmodule1.get_fetched_data_version()
    self.assertNotEqual(selected, version_a)
    # ... so with no selected version there is still no table.
    unselected_table = self.wfmodule1.retrieve_fetched_table()
    self.assertIsNone(unselected_table)

    # Selecting the stored version makes its table retrievable.
    self.wfmodule1.set_fetched_data_version(version_a)
    selected = self.wfmodule1.get_fetched_data_version()
    self.assertEqual(selected, version_a)
    loaded_a = self.wfmodule1.retrieve_fetched_table()
    self.assertTrue(loaded_a.equals(first_table))

    # A second stored version is likewise not selected automatically.
    version_b = self.wfmodule1.store_fetched_table(second_table)
    selected = self.wfmodule1.get_fetched_data_version()
    self.assertNotEqual(selected, version_b)
    self.wfmodule1.set_fetched_data_version(version_b)
    self.assertNotEqual(version_a, version_b)
    loaded_b = self.wfmodule1.retrieve_fetched_table()
    self.assertTrue(loaded_b.equals(second_table))

    # Switching back to the first version returns the first table again.
    self.wfmodule1.set_fetched_data_version(version_a)
    loaded_a = self.wfmodule1.retrieve_fetched_table()
    self.assertTrue(loaded_a.equals(first_table))

    # An invalid version string must be rejected.
    with self.assertRaises(ValidationError):
        self.wfmodule1.set_fetched_data_version('foo')

    # Listing: sorted by creation date, latest first.
    expected_versions = [version_b, version_a]
    listing = self.wfmodule1.list_fetched_data_versions()
    listed_versions = [entry[0] for entry in listing]
    self.assertListEqual(listed_versions, expected_versions)

    # Cached tables should not appear in listed data versions.
    StoredObject.create_table(self.wfmodule1, StoredObject.CACHED_TABLE, first_table)
    listing = self.wfmodule1.list_fetched_data_versions()
    listed_versions = [entry[0] for entry in listing]
    self.assertListEqual(listed_versions, expected_versions)

    # None of the above should have created versions on any other wfmodule.
    other_listing = self.wfmodule2.list_fetched_data_versions()
    self.assertEqual(other_listing, [])
def test_read_file_fastparquet_issue_375(self):
    """Uploading the issue-375 parquet fixture and reading it gives an empty table."""
    object_key = 'fastparquet-issue-375-snappy.par'
    package_dir = Path(__file__).parent.parent
    fixture_path = package_dir / 'test_data' / object_key

    # Put the fixture in the bucket the StoredObject below points at.
    bucket = minio.StoredObjectsBucket
    minio.fput_file(bucket, object_key, fixture_path)

    stored = StoredObject(size=10, bucket=minio.StoredObjectsBucket, key=object_key)
    expected = pd.DataFrame()
    assert_frame_equal(stored.get_table(), expected)
def test_create_table_if_different(self):
    """create_table_if_different() returns None for equal data, a new object otherwise."""
    same_data = pd.DataFrame({"A": [1]})
    other_data = pd.DataFrame({"A": [2]})

    original = StoredObject.create_table(self.wfm1, same_data)

    # Identical contents: no new object.
    unchanged = StoredObject.create_table_if_different(self.wfm1, original, same_data)
    self.assertIsNone(unchanged)

    # Differing contents: a new object that round-trips the new frame.
    changed = StoredObject.create_table_if_different(self.wfm1, original, other_data)
    self.assertIsNotNone(changed)
    reloaded = changed.get_table()
    assert_frame_equal(reloaded, other_data)
def test_store_empty_table(self):
    """An empty DataFrame is stored with its metadata and read back empty."""
    empty_frame = pd.DataFrame()
    stored = StoredObject.create_table(
        self.wfm1,
        empty_frame,
        metadata=self.metadata,
    )
    self.assertEqual(stored.metadata, self.metadata)

    reloaded = stored.get_table()
    self.assertTrue(reloaded.empty)
def test_load_missing_file(self):
    """
    An aborted delete leaves a StoredObject without a backing file.

    The delete runs inside a transaction that is then deliberately rolled
    back. The test expects that reading the object afterwards yields an
    empty DataFrame.
    """

    class _ForcedRollback(Exception):
        """Private marker raised solely to abort the transaction below."""

    test_table = pd.DataFrame(
        {
            "A": pd.Series([1, 2, 3], dtype=np.int64),
            "B": pd.Series([1, 2, 3], dtype=np.float64),
            "C": pd.Series(["x", np.nan, "y"], dtype=object),
            "D": pd.Series(["x", np.nan, "x"], dtype="category"),
            "E": pd.Series([datetime.now(), np.nan, datetime.now()]),
        }
    )
    so1 = StoredObject.create_table(self.wfm1, test_table)

    # Catch only our own marker: a genuine RuntimeError raised by delete()
    # must fail the test instead of being silently swallowed.
    try:
        with transaction.atomic():
            # 1. Get Django's pre-delete to delete the file from S3
            so1.delete()
            # 2. Rollback
            raise _ForcedRollback("not really an error")
    except _ForcedRollback:
        pass

    table = so1.get_table()
    assert_frame_equal(table, pd.DataFrame())
def test_nan_storage(self):
    """A float column containing NaN survives a store/load round trip."""
    # have previously run into problems serializing/deserializing NaN
    values = [10, np.nan, 11, 20]
    frame = pd.DataFrame({'M': values}, dtype=np.float64)

    stored = StoredObject.create_table(self.wfm1, frame)
    reloaded = stored.get_table()
    assert_frame_equal(reloaded, frame)
def test_duplicate(self):
    """Duplicating onto another wfm keeps time and contents but uses a new file."""
    # Duplicate from one wfm to another: the typical WfModule duplication case.
    source = StoredObject.create(self.wfm1, "Stored Text")
    copy = source.duplicate(self.wfm2)

    # Same timestamp ...
    self.assertEqual(source.stored_at, copy.stored_at)
    # ... different file ...
    self.assertNotEqual(source.file, copy.file)
    # ... with the same contents.
    source_data = source.get_data()
    copy_data = copy.get_data()
    self.assertEqual(source_data, copy_data)
def test_store_some_random_table(self):
    """A table mixing int, float, object, category and datetime columns round-trips."""
    # Build each column separately, in the same order as the final frame.
    ints = pd.Series([1, 2, 3], dtype=np.int64)
    floats = pd.Series([1, 2, 3], dtype=np.float64)
    objects = pd.Series(['x', np.nan, 'y'], dtype=object)
    categories = pd.Series(['x', np.nan, 'x'], dtype='category')
    timestamps = pd.Series([datetime.now(), np.nan, datetime.now()])
    frame = pd.DataFrame({
        'A': ints,
        'B': floats,
        'C': objects,
        'D': categories,
        'E': timestamps,
    })

    stored = StoredObject.create_table(self.wfm1, frame)
    reloaded = stored.get_table()
    self.assertTrue(reloaded.equals(frame))
def test_duplicate_table(self):
    """Duplicating a table keeps time, metadata and contents but uses a new file."""
    source = StoredObject.create_table(self.wfm1, mock_csv_table)
    copy = source.duplicate(self.wfm2)

    # Same time and same metadata.
    self.assertEqual(source.stored_at, copy.stored_at)
    self.assertEqual(source.metadata, copy.metadata)

    # Different file, same contents.
    self.assertNotEqual(source.file, copy.file)
    source_bytes = self.file_contents(source.file)
    copy_bytes = self.file_contents(copy.file)
    self.assertEqual(source_bytes, copy_bytes)

    # The tables read back from both objects must match as well.
    source_table = source.get_table()
    copy_table = copy.get_table()
    self.assertTrue(source_table.equals(copy_table))
def test_store_some_random_table(self):
    """A table mixing int, float, object, category and datetime columns round-trips."""
    # Columns are created in the same order they appear in the frame.
    columns = {}
    columns["A"] = pd.Series([1, 2, 3], dtype=np.int64)
    columns["B"] = pd.Series([1, 2, 3], dtype=np.float64)
    columns["C"] = pd.Series(["x", np.nan, "y"], dtype=object)
    columns["D"] = pd.Series(["x", np.nan, "x"], dtype="category")
    columns["E"] = pd.Series([datetime.now(), np.nan, datetime.now()])
    frame = pd.DataFrame(columns)

    stored = StoredObject.create_table(self.wfm1, frame)
    reloaded = stored.get_table()
    self.assertTrue(reloaded.equals(frame))
def test_duplicate_table(self):
    """Duplicating a table keeps time, metadata and contents but uses a new file."""
    frame = pd.DataFrame({'A': [1]})
    # Create the duplication target on the same workflow.
    self.wfm2 = self.workflow.wf_modules.create(order=1)

    source = StoredObject.create_table(self.wfm1, frame)
    copy = source.duplicate(self.wfm2)

    # Same time and same metadata.
    self.assertEqual(source.stored_at, copy.stored_at)
    self.assertEqual(source.metadata, copy.metadata)

    # Different file, same contents.
    self.assertNotEqual(source.file, copy.file)
    source_bytes = self.file_contents(source.file)
    copy_bytes = self.file_contents(copy.file)
    self.assertEqual(source_bytes, copy_bytes)
def test_duplicate_table(self):
    """Duplicating a table keeps time, size and bucket but stores under a new key."""
    frame = pd.DataFrame({"A": [1]})
    # Create the duplication target on the same tab as the source step.
    sibling_steps = self.wfm1.tab.wf_modules
    self.wfm2 = sibling_steps.create(order=1, slug="step-2")

    source = StoredObject.create_table(self.wfm1, frame)
    copy = source.duplicate(self.wfm2)

    # Same time, size and bucket.
    self.assertEqual(source.stored_at, copy.stored_at)
    self.assertEqual(source.size, copy.size)
    self.assertEqual(source.bucket, copy.bucket)

    # Different key, same contents.
    self.assertNotEqual(source.key, copy.key)
    copy_table = copy.get_table()
    assert_frame_equal(copy_table, frame)
def test_store_some_random_table(self):
    """Store and reload a realistic table built from the sfpd.json fixture.

    The loaded frame is kept on ``self.test_table``. The test checks that
    the stored object carries ``self.metadata`` and that the table read
    back equals the one that was stored.
    """
    # Use a more realistic test table with lots of data of different types
    # mock data wasn't finding bugs related to dict-type columns
    fname = os.path.join(settings.BASE_DIR, 'server/tests/test_data/sfpd.json')
    # Decode explicitly as UTF-8 so the fixture parses the same way
    # regardless of the platform's default locale encoding.
    with open(fname, encoding='utf-8') as f:
        sfpd = json.load(f)
    self.test_table = pd.DataFrame(sfpd)
    # Return value is discarded; presumably this sanitizes in place.
    sanitize_dataframe(self.test_table)

    so1 = StoredObject.create_table(self.wfm1, self.test_table, self.metadata)
    self.assertEqual(so1.metadata, self.metadata)
    table2 = so1.get_table()
    self.assertTrue(table2.equals(self.test_table))
def test_nan_storage(self):
    """A column with a missing value, parsed from CSV, survives a store/load round trip."""
    # have previously run into problems serializing / deserializing NaN values
    csv_text = (
        'Class,M,F\n'
        'math,10,12\n'
        'english,,7\n'
        'history,11,13\n'
        'economics,20,20'
    )
    parsed = pd.read_csv(io.StringIO(csv_text))

    # Wrap the column in the DataFrame ctor; otherwise we get a Series, not a frame.
    m_column = parsed['M']
    m_frame = pd.DataFrame(m_column)

    stored = StoredObject.create_table(self.wfm1, m_frame)
    reloaded = stored.get_table()
    self.assertTrue(reloaded.equals(m_frame))
def test_store_fetched_table(self):
    """The fixture table is stored with its metadata and read back unchanged."""
    stored = StoredObject.create_table(
        self.wfm1,
        self.test_table,
        self.metadata,
    )
    self.assertEqual(stored.metadata, self.metadata)

    reloaded = stored.get_table()
    self.assertTrue(reloaded.equals(self.test_table))
def test_read_file_missing(self):
    """A StoredObject whose file is 'hello' is expected to read as an empty table."""
    stored = StoredObject(file='hello', size=10)
    loaded = stored.get_table()
    expected = pd.DataFrame()
    assert_frame_equal(loaded, expected)
def test_store_empty_table(self):
    """An empty DataFrame is stored and read back empty."""
    empty_frame = pd.DataFrame()
    stored = StoredObject.create_table(self.wfm1, empty_frame)

    reloaded = stored.get_table()
    self.assertTrue(reloaded.empty)
def test_read_file_fastparquet_issue_375(self):
    """Reading the issue-375 parquet fixture from a local path gives an empty table."""
    tests_dir = os.path.dirname(__file__)
    fixture_path = os.path.join(
        tests_dir, '..', 'test_data', 'fastparquet-issue-375-snappy.par'
    )

    stored = StoredObject(file=fixture_path, size=10)
    loaded = stored.get_table()
    assert_frame_equal(loaded, pd.DataFrame())