def read_dataframe(self, *args, **kwargs):
        """
        Read Parquet file as a dataframe (costing network requests).

        Extra positional and keyword arguments are forwarded to
        `fastparquet.ParquetFile.to_pandas()`.

        TODO make this raise OSError/FastparquetCouldNotHandleFile. (Currently
        we return an empty dataframe on error.)
        """
        try:
            # Fix: forward *args/**kwargs unpacked. The previous code passed
            # the args tuple and kwargs dict as two positional arguments,
            # which is not the pass-through the docstring promises.
            return parquet.read(minio.CachedRenderResultsBucket,
                                self.parquet_key, *args, **kwargs)
        except OSError:
            # Two possibilities:
            #
            # 1. The file is missing.
            # 2. The file is empty. (We used to write empty files in
            #    assign_wf_module.)
            #
            # Either way, our cached DataFrame is "empty", and we represent
            # that as an empty DataFrame.
            return pd.DataFrame()
        except parquet.FastparquetCouldNotHandleFile:
            # Treat bugs as "empty file"
            return pd.DataFrame()
 def test_empty_categorical_has_object_dtype(self):
     # An all-empty str column converted to categorical must keep
     # object-dtyped categories across a Parquet write/read round trip.
     dataframe = pd.DataFrame({'A': []}, dtype='str').astype('category')
     assert dataframe['A'].cat.categories.dtype == object
     try:
         parquet.write(bucket, key, dataframe)
         roundtripped = parquet.read(bucket, key)
     finally:
         minio.remove(bucket, key)
     assert_frame_equal(roundtripped, dataframe)
# ---- Example 3 ----
    def get_table(self):
        """Load this object's cached table, or an empty one on failure."""
        # Legacy rows have no bucket/key because empty tables were once
        # skipped on write; treat them as empty.
        if not (self.bucket and self.key):
            return pd.DataFrame()

        try:
            return parquet.read(self.bucket, self.key)
        except parquet.FastparquetCouldNotHandleFile:
            # A file fastparquet cannot parse counts as an empty table.
            return pd.DataFrame()
 def test_na_only_categorical_has_object_dtype(self):
     # A categorical column holding only NA (no real values — in
     # Workbench, all Categoricals are text) must still report
     # object-dtyped categories after a write/read round trip.
     dataframe = pd.DataFrame({'A': [np.nan]}, dtype=str).astype('category')
     assert dataframe['A'].cat.categories.dtype == object
     try:
         parquet.write(bucket, key, dataframe)
         roundtripped = parquet.read(bucket, key)
     finally:
         minio.remove(bucket, key)
     assert_frame_equal(roundtripped, dataframe)
# ---- Example 5 ----
    def get_table(self):
        """Return the stored table, or an empty DataFrame when absent."""
        if not self.size:
            # Zero bytes stored: nothing to parse.
            return pd.DataFrame()

        try:
            return parquet.read(self.file.name)
        except FileNotFoundError:
            # Spotted on production for a duplicated workflow dated
            # 2018-08-01. [adamhooper, 2018-09-20] Treating the missing
            # file as an empty table does no harm.
            return pd.DataFrame()
    def get_table(self):
        """Read this object's cached table; fall back to an empty one."""
        # Old (obsolete) objects carry no bucket/key, usually because
        # empty tables were not written back then.
        if not (self.bucket and self.key):
            return pd.DataFrame()

        try:
            return parquet.read(self.bucket, self.key)
        except FileNotFoundError:
            # A pre-delete may never have been committed; or there is
            # some other, long-existing DB inconsistency.
            return pd.DataFrame()
        except parquet.FastparquetCouldNotHandleFile:
            # Unparseable file: treat as an empty table.
            return pd.DataFrame()
# ---- Example 7 ----
    def get_table(self):
        """Return the stored table, or an empty DataFrame when missing."""
        if not self.file:
            # Before 2018-11-09 empty data frames were not written at all.
            #
            # That changed because #160865813 showed zero-row Parquet
            # files still carry important data: column information.
            return pd.DataFrame()

        try:
            return parquet.read(self.file.name)
        except (FileNotFoundError, parquet.FastparquetCouldNotHandleFile):
            # Spotted on production for a duplicated workflow dated
            # 2018-08-01. [adamhooper, 2018-09-20] Returning an empty
            # dataframe here does no harm.
            return pd.DataFrame()
 def test_read_issue_375_snappy(self):
     # A file that triggers fastparquet issue 375 (snappy-compressed
     # variant) must surface as our dedicated wrapper exception.
     with self._file_on_s3('fastparquet-issue-375-snappy.par'):
         self.assertRaises(parquet.FastparquetIssue375,
                           parquet.read, bucket, key)
 def test_read_issue_375_uncompressed(self):
     # Same as the snappy case, but for the uncompressed variant of
     # fastparquet issue 375.
     with self._file_on_s3('fastparquet-issue-375.par'):
         self.assertRaises(parquet.FastparquetIssue375,
                           parquet.read, bucket, key)
# ---- Example 10 ----
 def test_read_issue_361(self):
     # https://github.com/dask/fastparquet/issues/361
     # A zero-column file must still report its row count.
     with self._file_on_s3('fastparquet-issue-361.par'):
         table = parquet.read(bucket, key)
         self.assertEqual(list(table.columns), [])
         self.assertEqual(len(table), 3)
# ---- Example 11 ----
 def test_read_issue_375_snappy(self):
     # fastparquet issue 375 (snappy variant) raises our wrapper.
     path = _path('fastparquet-issue-375-snappy.par')
     with self.assertRaises(parquet.FastparquetIssue375):
         parquet.read(path)
# ---- Example 12 ----
 def test_read_issue_375_uncompressed(self):
     # fastparquet issue 375 (uncompressed variant) raises our wrapper.
     path = _path('fastparquet-issue-375.par')
     with self.assertRaises(parquet.FastparquetIssue375):
         parquet.read(path)
# ---- Example 13 ----
 def test_read_issue_361(self):
     # fastparquet issue 361 raises our wrapper exception here.
     path = _path('fastparquet-issue-361.par')
     with self.assertRaises(parquet.FastparquetIssue361):
         parquet.read(path)
# ---- Example 14 ----
    def get_table(self):
        """Parse and return the stored table (empty when size is 0)."""
        if self.size:
            return parquet.read(self.file.name)
        # Nothing stored: represent as an empty table.
        return pd.DataFrame()