示例#1
0
 def test_schema_equality(self):
     """Check DataFrameSchema equality and inequality.

     A schema must compare equal to itself and unequal to a schema that
     differs either in its column list or in its name.
     """
     schema_name = "foo"
     column_1 = DataFrameColumn("frame_id", ColumnType.INTEGER, False)
     column_2 = DataFrameColumn("frame_data", ColumnType.NDARRAY, False,
                                [28, 28])
     column_3 = DataFrameColumn("frame_label", ColumnType.INTEGER, False)
     col_list = [column_1, column_2, column_3]
     schema1 = DataFrameSchema(schema_name, col_list)
     # Same name, but the first column is dropped.
     schema2 = DataFrameSchema(schema_name, col_list[1:])
     # Same columns, different name. This has to be a DataFrameSchema (not
     # a DataFrameColumn) so that the last assertion really compares two
     # schemas instead of objects of unrelated types.
     schema3 = DataFrameSchema('foo2', col_list)
     self.assertEqual(schema1, schema1)
     self.assertNotEqual(schema1, schema2)
     self.assertNotEqual(schema1, schema3)
示例#2
0
    def test_schema(self):
        """Verify the first entry of ``column_list`` on a new schema.

        Builds a three-column schema and asserts that the column at index 0
        carries the name of the first column handed to the constructor.
        """
        columns = [
            DataFrameColumn("frame_id", ColumnType.INTEGER, False),
            DataFrameColumn("frame_data", ColumnType.NDARRAY, False,
                            [28, 28]),
            DataFrameColumn("frame_label", ColumnType.INTEGER, False),
        ]
        schema = DataFrameSchema("foo", columns)

        first_column = schema.column_list[0]
        self.assertEqual(first_column.name, "frame_id")
示例#3
0
    def test_df_metadata(self):
        """Check DataFrameMetadata attributes after assigning a column list.

        The ``schema`` attribute is assigned a plain list of columns and is
        then expected to compare equal to a DataFrameSchema built from the
        same name and the same list.
        """
        df_metadata = DataFrameMetadata('name', 'eva_dataset')
        columns = [
            DataFrameColumn("frame_id", ColumnType.INTEGER, False),
            DataFrameColumn("frame_label", ColumnType.INTEGER, False),
        ]
        expected_schema = DataFrameSchema('name', columns)

        # Assign the raw column list, not a DataFrameSchema instance.
        df_metadata.schema = columns

        self.assertEqual(df_metadata.name, 'name')
        self.assertEqual(df_metadata.file_url, 'eva_dataset')
        self.assertEqual(df_metadata.id, None)
        self.assertEqual(df_metadata.identifier_column, 'id')
        self.assertEqual(df_metadata.schema, expected_schema)
示例#4
0
 def get_metadata(self,
                  metadata_id: int,
                  col_id_list: List[int] = None) -> DataFrameMetadata:
     """
     Fetch the metadata object for a table, as requested by the executor.
     The returned object is later used by the storage engine to retrieve
     the dataframe.
     :param metadata_id: metadata id of the table
     :param col_id_list: optional column ids of the table referred; when
         given, the metadata's schema is replaced by one built from the
         columns looked up for these ids
     :return: the object returned by DataFrameMetadata.get for metadata_id
     """
     metadata = DataFrameMetadata.get(metadata_id)

     # No column restriction requested: hand back the metadata untouched.
     if col_id_list is None:
         return metadata

     selected_columns = DataFrameColumn.get_by_metadata_id_and_id_in(
         col_id_list, metadata_id)
     restricted_schema = DataFrameSchema(metadata.get_name(),
                                         selected_columns)
     metadata.set_schema(restricted_schema)
     return metadata
示例#5
0
 def test_df_schema(self):
     """Check a DataFrameSchema's name, columns, and derived schemas.

     The petastorm and pyspark schemas exposed by the schema object are
     compared with the ones obtained from
     SchemaUtils.get_petastorm_schema for the same name and column list.
     """
     name = "foo"
     columns = [
         DataFrameColumn("frame_id", ColumnType.INTEGER, False),
         DataFrameColumn("frame_data", ColumnType.NDARRAY, False,
                         [28, 28]),
         DataFrameColumn("frame_label", ColumnType.INTEGER, False),
     ]
     schema = DataFrameSchema(name, columns)
     expected = SchemaUtils.get_petastorm_schema(name, columns)

     self.assertEqual(schema.name, name)
     self.assertEqual(schema.column_list, columns)

     # Compare the petastorm fields as a whole first, then pairwise.
     self.assertEqual(schema.petastorm_schema.fields, expected.fields)
     for actual_field, expected_field in zip(
             schema.petastorm_schema.fields, expected.fields):
         self.assertEqual(actual_field, expected_field)

     self.assertEqual(schema.pyspark_schema, expected.as_spark_schema())
示例#6
0
 def schema(self, column_list):
     """Replace the stored schema with one built from ``column_list``.

     :param column_list: columns passed to DataFrameSchema together with
         this object's ``_name``
     """
     rebuilt_schema = DataFrameSchema(self._name, column_list)
     self._schema = rebuilt_schema