def test_get_petastorm_schema(self, mock_get_pc, mock_uni):
    """Check how get_petastorm_schema uses its two mocked collaborators.

    The test asserts that ``mock_get_pc`` is called with each column in
    order, that ``mock_uni`` is called exactly once with the schema name
    and the list of values ``mock_get_pc`` produced, and that the value
    returned to the caller is ``mock_uni.return_value``.

    NOTE(review): the mocks are presumably injected by ``patch`` decorators
    that sit above this method -- confirm which targets they replace.
    """
    # Arrange: two opaque columns; the per-column mock yields 1 then 2.
    columns = [MagicMock(), MagicMock()]
    mock_get_pc.side_effect = [1, 2]

    # Act
    schema = SchemaUtils.get_petastorm_schema('name', columns)

    # Assert
    self.assertEqual(schema, mock_uni.return_value)
    expected_calls = [call(column) for column in columns]
    mock_get_pc.assert_has_calls(expected_calls)
    mock_uni.assert_called_once_with('name', [1, 2])
def test_get_petastorm_column(self):
    """Check the scalar column conversions of get_petastorm_column.

    Each scalar case builds a DataFrameColumn and the UnischemaField it is
    expected to convert to (empty shape, ScalarCodec around a Spark type),
    then compares the two. A final case passes ``None`` as the column type
    and expects ``None`` back.
    """
    col_name = 'frame_id'

    # (column type, flag passed to both constructors, numpy type, Spark type)
    # NOTE(review): the boolean is presumably the nullable flag -- confirm
    # against DataFrameColumn's signature.
    scalar_cases = (
        (ColumnType.INTEGER, False, np.int32, IntegerType),
        (ColumnType.FLOAT, True, np.float64, FloatType),
        (ColumnType.TEXT, False, np.str_, StringType),
    )
    for column_type, flag, numpy_type, spark_type in scalar_cases:
        column = DataFrameColumn(col_name, column_type, flag)
        expected_field = UnischemaField(
            col_name, numpy_type, (), ScalarCodec(spark_type()), flag)
        self.assertEqual(
            SchemaUtils.get_petastorm_column(column), expected_field)

    # A column whose type is None is expected to convert to None.
    untyped_column = DataFrameColumn(col_name, None, True, [10, 10])
    self.assertEqual(SchemaUtils.get_petastorm_column(untyped_column), None)
def test_get_petastorm_column_ndarray(self):
    """Check get_petastorm_column for NDARRAY columns of every array type.

    Walks ``NdArrayType`` in declaration order alongside the list of
    expected numpy types and asserts that each column converts to a
    UnischemaField with that numpy type, shape ``[10, 10]`` and an
    NdarrayCodec.

    NOTE(review): ``zip`` stops at the shorter input, so members of
    ``NdArrayType`` beyond the twelve listed here would go unchecked --
    confirm the list matches the enum.
    """
    expected_type = [
        np.int8,
        np.uint8,
        np.int16,
        np.int32,
        np.int64,
        # Previously np.unicode_: that alias was removed in NumPy 2.0 and is
        # the very same object as np.str_ on NumPy 1.x, so the expectation
        # itself is unchanged.
        np.str_,
        np.bool_,
        np.float32,
        np.float64,
        Decimal,
        np.str_,
        np.datetime64,
    ]
    col_name = 'frame_id'
    for array_type, np_type in zip(NdArrayType, expected_type):
        # subTest makes a failure report name the offending array type
        # instead of only pointing at the shared assertion line.
        with self.subTest(array_type=array_type):
            col = DataFrameColumn(col_name, ColumnType.NDARRAY, True,
                                  array_type, [10, 10])
            petastorm_col = UnischemaField(col_name, np_type, [10, 10],
                                           NdarrayCodec(), True)
            self.assertEqual(SchemaUtils.get_petastorm_column(col),
                             petastorm_col)
def test_df_schema(self):
    """Check that DataFrameSchema exposes its name, columns and schemas.

    Builds a three-column schema and compares its ``petastorm_schema`` and
    ``pyspark_schema`` against what ``SchemaUtils.get_petastorm_schema``
    produces for the same name and column list.
    """
    schema_name = "foo"
    # NOTE(review): the fourth positional argument of the NDARRAY column is
    # a shape list here, whereas test_get_petastorm_column_ndarray passes an
    # NdArrayType in that position followed by the shape -- confirm against
    # DataFrameColumn's signature.
    col_list = [
        DataFrameColumn("frame_id", ColumnType.INTEGER, False),
        DataFrameColumn("frame_data", ColumnType.NDARRAY, False, [28, 28]),
        DataFrameColumn("frame_label", ColumnType.INTEGER, False),
    ]

    schema = DataFrameSchema(schema_name, col_list)
    expected_schema = SchemaUtils.get_petastorm_schema(schema_name, col_list)

    self.assertEqual(schema.name, schema_name)
    self.assertEqual(schema.column_list, col_list)

    # Compare the field collections as a whole, then element by element.
    # NOTE(review): if ``fields`` is a mapping, the pairwise loop only
    # compares its keys -- confirm.
    self.assertEqual(schema.petastorm_schema.fields, expected_schema.fields)
    field_pairs = zip(schema.petastorm_schema.fields, expected_schema.fields)
    for actual_field, expected_field in field_pairs:
        self.assertEqual(actual_field, expected_field)

    self.assertEqual(schema.pyspark_schema, expected_schema.as_spark_schema())
def exec(self):
    """
    Build one batch from the node's values and write it to storage.

    Every expression in ``self.node.value_list`` is evaluated and its
    frames are relabelled with the name of the column it is paired with.
    The per-column results are merged column-wise, cast against the
    table's petastorm schema, and handed to the storage engine.

    As before, this assumes there are no missing values: columns and
    values are paired positionally.
    """
    node = self.node
    table_id = node.video_id

    evaluated_columns = []
    for column, expression in zip(node.column_list, node.value_list):
        result = expression.evaluate()
        # Label the evaluated frames with the target column's name.
        result.frames.columns = [column.col_name]
        evaluated_columns.append(result)

    merged_batch = Batch.merge_column_wise(evaluated_columns)
    table_metadata = CatalogManager().get_metadata(table_id)

    # verify value types are consistent
    merged_batch.frames = SchemaUtils.petastorm_type_cast(
        table_metadata.schema.petastorm_schema, merged_batch.frames)

    StorageEngine.write(table_metadata, merged_batch)