예제 #1
0
    def test_read_file_fastparquet_issue_375(self):
        """The fastparquet-issue-375 fixture must load as an empty DataFrame.

        The fixture is uploaded to the stored-objects bucket first, then read
        back through a ``StoredObject`` that points at the uploaded key.
        """
        key = 'fastparquet-issue-375-snappy.par'
        fixture_path = os.path.join(
            os.path.dirname(__file__), '..', 'test_data', key
        )
        minio_client.fput_object(StoredObjectsBucket, key, fixture_path)

        stored = StoredObject(size=10, bucket=StoredObjectsBucket, key=key)
        assert_frame_equal(stored.get_table(), pd.DataFrame())
예제 #2
0
    def test_create_table_if_different(self):
        """``create_table_if_different`` returns None unless the table changed.

        Passing the table that was just stored must yield ``None``; passing
        ``mock_csv_table2`` must yield an object whose table equals it.
        """
        original = StoredObject.create_table(self.wfm1, mock_csv_table)

        # Same table as the stored one: no new object expected.
        unchanged = StoredObject.create_table_if_different(
            self.wfm1, original, mock_csv_table
        )
        self.assertIsNone(unchanged)

        # Different table: a new object holding that table is expected.
        changed = StoredObject.create_table_if_different(
            self.wfm1, original, mock_csv_table2
        )
        self.assertIsNotNone(changed)
        self.assertTrue(changed.get_table().equals(mock_csv_table2))
예제 #3
0
    def test_wf_module_data_versions(self):
        """Walk through storing, selecting and listing fetched-data versions."""
        first_table = mock_csv_table
        second_table = mock_csv_table2

        # Before anything is stored there is no fetched table.
        self.assertIsNone(self.wfmodule1.retrieve_fetched_table())

        # Store a table, persist, and reload from the database.
        first_version = self.wfmodule1.store_fetched_table(first_table)
        self.wfmodule1.save()
        self.wfmodule1.refresh_from_db()
        # Storing alone should not switch versions by itself ...
        current_version = self.wfmodule1.get_fetched_data_version()
        self.assertNotEqual(current_version, first_version)
        # ... so with no stored version selected there is no table either.
        self.assertIsNone(self.wfmodule1.retrieve_fetched_table())
        self.wfmodule1.set_fetched_data_version(first_version)
        self.assertEqual(
            self.wfmodule1.get_fetched_data_version(), first_version
        )
        restored_first = self.wfmodule1.retrieve_fetched_table()
        self.assertTrue(restored_first.equals(first_table))

        # Store a second version; again it must not be selected implicitly.
        second_version = self.wfmodule1.store_fetched_table(second_table)
        current_version = self.wfmodule1.get_fetched_data_version()
        self.assertNotEqual(current_version, second_version)
        self.wfmodule1.set_fetched_data_version(second_version)
        self.assertNotEqual(first_version, second_version)
        restored_second = self.wfmodule1.retrieve_fetched_table()
        self.assertTrue(restored_second.equals(second_table))

        # Switch back to the first version.
        self.wfmodule1.set_fetched_data_version(first_version)
        restored_first = self.wfmodule1.retrieve_fetched_table()
        self.assertTrue(restored_first.equals(first_table))

        # An invalid version string should error.
        with self.assertRaises(ValidationError):
            self.wfmodule1.set_fetched_data_version('foo')

        # Versions are listed sorted by creation date, latest first.
        listed = self.wfmodule1.list_fetched_data_versions()
        expected_versions = [second_version, first_version]
        self.assertListEqual([entry[0] for entry in listed], expected_versions)

        # Cached tables should not appear in listed data versions.
        StoredObject.create_table(
            self.wfmodule1, StoredObject.CACHED_TABLE, first_table
        )
        listed = self.wfmodule1.list_fetched_data_versions()
        self.assertListEqual([entry[0] for entry in listed], expected_versions)

        # None of the above should have created versions on any other module.
        self.assertEqual(self.wfmodule2.list_fetched_data_versions(), [])
예제 #4
0
    def test_read_file_fastparquet_issue_375(self):
        """The fastparquet-issue-375 fixture must load as an empty DataFrame.

        The fixture is uploaded with ``minio.fput_file`` and then read back
        through a ``StoredObject`` that points at the uploaded key.
        """
        key = 'fastparquet-issue-375-snappy.par'
        fixture_path = Path(__file__).parent.parent / 'test_data' / key
        bucket = minio.StoredObjectsBucket
        minio.fput_file(bucket, key, fixture_path)

        stored = StoredObject(size=10, bucket=minio.StoredObjectsBucket, key=key)
        assert_frame_equal(stored.get_table(), pd.DataFrame())
예제 #5
0
    def test_create_table_if_different(self):
        """``create_table_if_different`` returns None unless the table changed.

        An equal frame must yield ``None``; a frame with a different value
        must yield an object whose table equals that frame.
        """
        stored_frame = pd.DataFrame({"A": [1]})
        other_frame = pd.DataFrame({"A": [2]})

        original = StoredObject.create_table(self.wfm1, stored_frame)

        # Same contents as what was stored: no new object expected.
        unchanged = StoredObject.create_table_if_different(
            self.wfm1, original, stored_frame
        )
        self.assertIsNone(unchanged)

        # Different contents: a new object holding ``other_frame`` is expected.
        changed = StoredObject.create_table_if_different(
            self.wfm1, original, other_frame
        )
        self.assertIsNotNone(changed)

        assert_frame_equal(changed.get_table(), other_frame)
예제 #6
0
 def test_store_empty_table(self):
     """Storing an empty DataFrame keeps the metadata and reads back empty."""
     stored = StoredObject.create_table(
         self.wfm1, pd.DataFrame(), metadata=self.metadata
     )
     self.assertEqual(stored.metadata, self.metadata)
     self.assertTrue(stored.get_table().empty)
예제 #7
0
    def test_load_missing_file(self):
        """
        A StoredObject whose delete was rolled back must load as an empty table.

        An aborted delete leaves a StoredObject without a backing file.
        """
        columns = {
            "A": pd.Series([1, 2, 3], dtype=np.int64),
            "B": pd.Series([1, 2, 3], dtype=np.float64),
            "C": pd.Series(["x", np.nan, "y"], dtype=object),
            "D": pd.Series(["x", np.nan, "x"], dtype="category"),
            "E": pd.Series([datetime.now(), np.nan, datetime.now()]),
        }
        frame = pd.DataFrame(columns)
        stored = StoredObject.create_table(self.wfm1, frame)
        try:
            with transaction.atomic():
                # Step 1: have Django's pre-delete remove the file from S3.
                stored.delete()
                # Step 2: raise so the enclosing transaction rolls back.
                raise RuntimeError("not really an error")
        except RuntimeError:
            # The exception only exists to abort the transaction above.
            pass

        assert_frame_equal(stored.get_table(), pd.DataFrame())
예제 #8
0
    def test_nan_storage(self):
        """A float64 column containing NaN must round-trip unchanged."""
        # NaN has caused serialization/deserialization problems before.
        values = {'M': [10, np.nan, 11, 20]}
        frame = pd.DataFrame(values, dtype=np.float64)

        stored = StoredObject.create_table(self.wfm1, frame)
        assert_frame_equal(stored.get_table(), frame)
예제 #9
0
    def test_duplicate(self):
        """Duplicating a StoredObject onto another module copies its data."""
        # Duplicate from one wfm to another: the typical WfModule
        # duplication case.
        source = StoredObject.create(self.wfm1, "Stored Text")
        clone = source.duplicate(self.wfm2)

        # The clone keeps the timestamp, gets a different file, and that
        # file holds the same contents.
        self.assertEqual(source.stored_at, clone.stored_at)
        self.assertNotEqual(source.file, clone.file)
        self.assertEqual(source.get_data(), clone.get_data())
예제 #10
0
 def test_store_some_random_table(self):
     """A table with mixed column dtypes must round-trip unchanged."""
     columns = {
         'A': pd.Series([1, 2, 3], dtype=np.int64),
         'B': pd.Series([1, 2, 3], dtype=np.float64),
         'C': pd.Series(['x', np.nan, 'y'], dtype=object),
         'D': pd.Series(['x', np.nan, 'x'], dtype='category'),
         'E': pd.Series([datetime.now(), np.nan, datetime.now()]),
     }
     frame = pd.DataFrame(columns)
     stored = StoredObject.create_table(self.wfm1, frame)
     self.assertTrue(stored.get_table().equals(frame))
예제 #11
0
    def test_duplicate_table(self):
        """Duplicating a stored table copies its metadata and contents."""
        source = StoredObject.create_table(self.wfm1, mock_csv_table)
        clone = source.duplicate(self.wfm2)

        # The clone keeps the timestamp and metadata but uses its own file.
        self.assertEqual(source.stored_at, clone.stored_at)
        self.assertEqual(source.metadata, clone.metadata)
        self.assertNotEqual(source.file, clone.file)

        # The two files hold the same bytes and decode to equal tables.
        source_bytes = self.file_contents(source.file)
        clone_bytes = self.file_contents(clone.file)
        self.assertEqual(source_bytes, clone_bytes)
        self.assertTrue(source.get_table().equals(clone.get_table()))
예제 #12
0
 def test_store_some_random_table(self):
     """Store a mixed-dtype table and check it reads back equal."""
     int_col = pd.Series([1, 2, 3], dtype=np.int64)
     float_col = pd.Series([1, 2, 3], dtype=np.float64)
     object_col = pd.Series(["x", np.nan, "y"], dtype=object)
     category_col = pd.Series(["x", np.nan, "x"], dtype="category")
     datetime_col = pd.Series([datetime.now(), np.nan, datetime.now()])
     frame = pd.DataFrame(
         {
             "A": int_col,
             "B": float_col,
             "C": object_col,
             "D": category_col,
             "E": datetime_col,
         }
     )
     stored = StoredObject.create_table(self.wfm1, frame)
     loaded = stored.get_table()
     self.assertTrue(loaded.equals(frame))
예제 #13
0
    def test_duplicate_table(self):
        """Duplicating a stored table onto a new module copies the file."""
        frame = pd.DataFrame({'A': [1]})

        # Create the destination module the duplicate will belong to.
        self.wfm2 = self.workflow.wf_modules.create(order=1)
        source = StoredObject.create_table(self.wfm1, frame)
        clone = source.duplicate(self.wfm2)

        # The clone keeps the timestamp and metadata but uses its own file.
        self.assertEqual(source.stored_at, clone.stored_at)
        self.assertEqual(source.metadata, clone.metadata)
        self.assertNotEqual(source.file, clone.file)

        # Both files must hold the same contents.
        source_bytes = self.file_contents(source.file)
        clone_bytes = self.file_contents(clone.file)
        self.assertEqual(source_bytes, clone_bytes)
예제 #14
0
    def test_duplicate_table(self):
        """Duplicating a stored table yields a new key with the same table."""
        frame = pd.DataFrame({"A": [1]})

        # Create the destination step on the same tab as ``wfm1``.
        self.wfm2 = self.wfm1.tab.wf_modules.create(order=1, slug="step-2")
        source = StoredObject.create_table(self.wfm1, frame)
        clone = source.duplicate(self.wfm2)

        # The clone shares timestamp, size and bucket with the source ...
        self.assertEqual(source.stored_at, clone.stored_at)
        self.assertEqual(source.size, clone.size)
        self.assertEqual(source.bucket, clone.bucket)
        # ... but lives under a different key and still decodes to ``frame``.
        self.assertNotEqual(source.key, clone.key)
        assert_frame_equal(clone.get_table(), frame)
예제 #15
0
    def test_store_some_random_table(self):
        """Round-trip a table built from the ``sfpd.json`` fixture.

        Loads the fixture, sanitizes the resulting DataFrame, stores it with
        ``self.metadata``, and checks that both the metadata and the table
        read back equal to what was stored.
        """
        # Use a more realistic test table with lots of data of different
        # types; mock data wasn't finding bugs related to dict-type columns.
        fname = os.path.join(settings.BASE_DIR,
                             'server/tests/test_data/sfpd.json')
        # Name the encoding explicitly: without it ``open`` uses the locale
        # default, so the fixture could parse differently (or fail) on
        # platforms whose default is not UTF-8.
        with open(fname, encoding='utf-8') as f:
            sfpd = json.load(f)
        self.test_table = pd.DataFrame(sfpd)
        # Return value is ignored, as in the original call — presumably
        # sanitize_dataframe mutates its argument in place (TODO confirm).
        sanitize_dataframe(self.test_table)

        so1 = StoredObject.create_table(self.wfm1, self.test_table,
                                        self.metadata)
        self.assertEqual(so1.metadata, self.metadata)
        table2 = so1.get_table()
        self.assertTrue(table2.equals(self.test_table))
예제 #16
0
    def test_nan_storage(self):
        """A CSV-derived column with a missing value must round-trip."""
        # NaN values have caused serialize/deserialize problems in the past.
        csv_text = (
            'Class,M,F\n'
            'math,10,12\n'
            'english,,7\n'
            'history,11,13\n'
            'economics,20,20'
        )
        parsed = pd.read_csv(io.StringIO(csv_text))
        # Wrap in the DataFrame constructor: plain indexing gives a Series.
        column_m = pd.DataFrame(parsed['M'])

        stored = StoredObject.create_table(self.wfm1, column_m)
        loaded = stored.get_table()
        self.assertTrue(loaded.equals(column_m))
예제 #17
0
 def test_store_fetched_table(self):
     """Storing ``self.test_table`` keeps its metadata and contents."""
     stored = StoredObject.create_table(
         self.wfm1, self.test_table, self.metadata
     )
     self.assertEqual(stored.metadata, self.metadata)
     loaded = stored.get_table()
     self.assertTrue(loaded.equals(self.test_table))
 def test_read_file_missing(self):
     """A StoredObject built with ``file='hello'`` must read as empty."""
     stored = StoredObject(file='hello', size=10)
     expected = pd.DataFrame()
     assert_frame_equal(stored.get_table(), expected)
예제 #19
0
 def test_store_empty_table(self):
     """An empty DataFrame must still be empty after a store/load cycle."""
     stored = StoredObject.create_table(self.wfm1, pd.DataFrame())
     self.assertTrue(stored.get_table().empty)
 def test_read_file_fastparquet_issue_375(self):
     """The fastparquet-issue-375 fixture must load as an empty DataFrame."""
     fixture_path = os.path.join(
         os.path.dirname(__file__),
         '..',
         'test_data',
         'fastparquet-issue-375-snappy.par',
     )
     stored = StoredObject(file=fixture_path, size=10)
     assert_frame_equal(stored.get_table(), pd.DataFrame())