def test_schema_equality(self):
    """Check equality and inequality comparisons between schemas.

    A schema is expected to compare equal to itself, and unequal both to
    a schema holding fewer columns and to a schema that holds the same
    columns under a different name.
    """
    schema_name = "foo"
    column_1 = DataFrameColumn("frame_id", ColumnType.INTEGER, False)
    column_2 = DataFrameColumn("frame_data", ColumnType.NDARRAY, False,
                               [28, 28])
    column_3 = DataFrameColumn("frame_label", ColumnType.INTEGER, False)
    col_list = [column_1, column_2, column_3]

    schema1 = DataFrameSchema(schema_name, col_list)
    # Same name as schema1, but without the first column.
    schema2 = DataFrameSchema(schema_name, col_list[1:])
    # Same columns as schema1, different name. This has to be a
    # DataFrameSchema: comparing schema1 against a DataFrameColumn would
    # make the final assertion pass without exercising the name
    # comparison between two schemas at all.
    schema3 = DataFrameSchema('foo2', col_list)

    self.assertEqual(schema1, schema1)
    self.assertNotEqual(schema1, schema2)
    self.assertNotEqual(schema1, schema3)
def test_schema(self):
    """Check that the first column of a schema keeps its given name."""
    columns = [
        DataFrameColumn("frame_id", ColumnType.INTEGER, False),
        DataFrameColumn("frame_data", ColumnType.NDARRAY, False, [28, 28]),
        DataFrameColumn("frame_label", ColumnType.INTEGER, False),
    ]
    schema = DataFrameSchema("foo", columns)

    first_column = schema.column_list[0]
    self.assertEqual(first_column.name, "frame_id")
def test_df_metadata(self):
    """Check the attributes of a DataFrameMetadata after a schema is set.

    The column list is assigned through the ``schema`` attribute, and the
    test expects to read back a value equal to a DataFrameSchema built
    from the same name and columns.
    """
    df_metadata = DataFrameMetadata('name', 'eva_dataset')
    columns = [
        DataFrameColumn("frame_id", ColumnType.INTEGER, False),
        DataFrameColumn("frame_label", ColumnType.INTEGER, False),
    ]
    expected_schema = DataFrameSchema('name', columns)

    df_metadata.schema = columns

    # Values passed to the constructor.
    self.assertEqual(df_metadata.name, 'name')
    self.assertEqual(df_metadata.file_url, 'eva_dataset')
    # Values the test expects without having set them explicitly.
    self.assertEqual(df_metadata.id, None)
    self.assertEqual(df_metadata.identifier_column, 'id')
    # Schema derived from the assigned column list.
    self.assertEqual(df_metadata.schema, expected_schema)
def get_metadata(self, metadata_id: int,
                 col_id_list: List[int] = None) -> DataFrameMetadata:
    """
    Look up the metadata object for a metadata_id on behalf of the
    executor. The result is further used by the storage engine for
    retrieving the dataframe.

    :param metadata_id: metadata id of the table
    :param col_id_list: optional column ids of the table referred
    :return: the object returned by DataFrameMetadata.get; when
        col_id_list is given, its schema is first replaced by one built
        from the columns matching those ids
    """
    metadata = DataFrameMetadata.get(metadata_id)

    # Without a column selection the metadata is handed back untouched.
    if col_id_list is None:
        return metadata

    selected_columns = DataFrameColumn.get_by_metadata_id_and_id_in(
        col_id_list, metadata_id)
    metadata.set_schema(
        DataFrameSchema(metadata.get_name(), selected_columns))
    return metadata
def test_df_schema(self):
    """Check a schema's name, columns, petastorm and pyspark schemas.

    The expected petastorm schema is obtained from
    SchemaUtils.get_petastorm_schema with the same name and columns.
    """
    schema_name = "foo"
    col_list = [
        DataFrameColumn("frame_id", ColumnType.INTEGER, False),
        DataFrameColumn("frame_data", ColumnType.NDARRAY, False, [28, 28]),
        DataFrameColumn("frame_label", ColumnType.INTEGER, False),
    ]
    schema = DataFrameSchema(schema_name, col_list)
    expected_schema = SchemaUtils.get_petastorm_schema(schema_name,
                                                       col_list)

    self.assertEqual(schema.name, schema_name)
    self.assertEqual(schema.column_list, col_list)

    actual_fields = schema.petastorm_schema.fields
    wanted_fields = expected_schema.fields
    self.assertEqual(actual_fields, wanted_fields)
    # Also compare pairwise, over whatever iterating the two field
    # collections yields.
    for actual, wanted in zip(actual_fields, wanted_fields):
        self.assertEqual(actual, wanted)

    self.assertEqual(schema.pyspark_schema,
                     expected_schema.as_spark_schema())
def schema(self, column_list):
    """Replace the stored schema with one built from ``column_list``.

    The new DataFrameSchema is created under this object's ``_name`` and
    stored in ``_schema``.

    :param column_list: columns handed to the DataFrameSchema constructor
    """
    rebuilt_schema = DataFrameSchema(self._name, column_list)
    self._schema = rebuilt_schema