예제 #1
0
 def test_get_petastorm_schema(self, mock_get_pc, mock_uni):
     """get_petastorm_schema converts every column and unifies the results."""
     # Two mock columns; the patched per-column converter yields 1, then 2.
     mock_cols = [MagicMock(), MagicMock()]
     mock_get_pc.side_effect = [1, 2]

     result = SchemaUtils.get_petastorm_schema('name', mock_cols)

     self.assertEqual(result, mock_uni.return_value)
     # Each column must be converted exactly once, in order.
     mock_get_pc.assert_has_calls([call(c) for c in mock_cols])
     mock_uni.assert_called_once_with('name', [1, 2])
예제 #2
0
    def test_get_petastorm_column(self):
        """Scalar column types map to the matching petastorm UnischemaField."""
        col_name = 'frame_id'
        # (column type, numpy type, spark type class, nullable) per case.
        scalar_cases = [
            (ColumnType.INTEGER, np.int32, IntegerType, False),
            (ColumnType.FLOAT, np.float64, FloatType, True),
            (ColumnType.TEXT, np.str_, StringType, False),
        ]
        for col_type, np_type, spark_type, nullable in scalar_cases:
            col = DataFrameColumn(col_name, col_type, nullable)
            expected = UnischemaField(col_name, np_type, (),
                                      ScalarCodec(spark_type()), nullable)
            self.assertEqual(SchemaUtils.get_petastorm_column(col), expected)

        # A column with no (None) type yields no petastorm column.
        col = DataFrameColumn(col_name, None, True, [10, 10])
        self.assertEqual(SchemaUtils.get_petastorm_column(col), None)
예제 #3
0
 def test_get_petastorm_column_ndarray(self):
     """Each NdArrayType maps to a UnischemaField with the matching numpy dtype."""
     # Expected numpy dtypes, ordered to line up with NdArrayType's members.
     np_types = [
         np.int8, np.uint8, np.int16, np.int32, np.int64, np.unicode_,
         np.bool_, np.float32, np.float64, Decimal, np.str_, np.datetime64
     ]
     col_name = 'frame_id'
     for ndarray_type, np_type in zip(NdArrayType, np_types):
         col = DataFrameColumn(col_name, ColumnType.NDARRAY, True,
                               ndarray_type, [10, 10])
         expected = UnischemaField(col_name, np_type, [10, 10],
                                   NdarrayCodec(), True)
         self.assertEqual(SchemaUtils.get_petastorm_column(col),
                          expected)
예제 #4
0
 def test_df_schema(self):
     """DataFrameSchema exposes its name, columns, and consistent petastorm/spark schemas."""
     schema_name = "foo"
     col_list = [
         DataFrameColumn("frame_id", ColumnType.INTEGER, False),
         DataFrameColumn("frame_data", ColumnType.NDARRAY, False,
                         [28, 28]),
         DataFrameColumn("frame_label", ColumnType.INTEGER, False),
     ]
     schema = DataFrameSchema(schema_name, col_list)
     expected_schema = SchemaUtils.get_petastorm_schema(
         schema_name, col_list)

     self.assertEqual(schema.name, schema_name)
     self.assertEqual(schema.column_list, col_list)
     # Fields must match both as whole lists and element-by-element.
     self.assertEqual(schema.petastorm_schema.fields,
                      expected_schema.fields)
     for actual_field, expected_field in zip(schema.petastorm_schema.fields,
                                             expected_schema.fields):
         self.assertEqual(actual_field, expected_field)
     self.assertEqual(schema.pyspark_schema,
                      expected_schema.as_spark_schema())
예제 #5
0
    def exec(self):
        """
        Build a single batch from the node's columns/values and persist it.

        Each value expression is evaluated, relabelled with its target
        column name, merged column-wise into one Batch, cast to the
        table's petastorm schema, and written via the storage engine.
        Right now we assume there are no missing values.
        """
        table_id = self.node.video_id

        evaluated_columns = []
        for column, expression in zip(self.node.column_list,
                                      self.node.value_list):
            value_batch = expression.evaluate()
            # Relabel the frame so it carries the target column's name.
            value_batch.frames.columns = [column.col_name]
            evaluated_columns.append(value_batch)

        batch = Batch.merge_column_wise(evaluated_columns)
        metadata = CatalogManager().get_metadata(table_id)
        # Cast values so their types are consistent with the table schema.
        batch.frames = SchemaUtils.petastorm_type_cast(
            metadata.schema.petastorm_schema, batch.frames)
        StorageEngine.write(metadata, batch)