Exemplo n.º 1
0
    def test_should_load_and_select_in_table(self):
        # Load the dummy video once, then check three projections of the
        # table: ids only, data only, and both columns together.
        perform_query("""LOAD DATA INFILE 'dummy.avi' INTO MyVideo;""")

        # Projection 1: one row per frame id.
        id_batch = perform_query("SELECT id FROM MyVideo;")
        id_rows = []
        for frame_id in range(NUM_FRAMES):
            id_rows.append({"id": frame_id})
        self.assertEqual(id_batch, Batch(frames=pd.DataFrame(id_rows)))

        # Projection 2: one 2x2x3 uint8 array per frame.
        data_batch = perform_query("SELECT data FROM MyVideo;")
        data_rows = []
        for frame_id in range(NUM_FRAMES):
            pixels = np.ones((2, 2, 3)) * 0.1 * float(frame_id + 1) * 255
            data_rows.append({"data": np.array(pixels, dtype=np.uint8)})
        self.assertEqual(data_batch, Batch(frames=pd.DataFrame(data_rows)))

        # select * is not supported
        both_columns = [perform_query("SELECT id,data FROM MyVideo;")]
        self.assertEqual(both_columns, list(create_dummy_batches()))
Exemplo n.º 2
0
    def test_adding_batch_frame_with_outcomes_returns_new_batch_frame(self):
        # The sum of two batches built from create_dataframe() should equal
        # a batch built from create_dataframe_same(2).
        left = Batch(frames=create_dataframe())
        right = Batch(frames=create_dataframe())
        expected_sum = Batch(frames=create_dataframe_same(2))

        actual_sum = left + right
        self.assertEqual(expected_sum, actual_sum)
Exemplo n.º 3
0
    def test_merge_column_wise_batch_frame(self):
        # Merging an 'id'-only batch with a 'data'-only batch column-wise
        # should equal a single batch holding both columns.
        id_only = Batch(frames=pd.DataFrame([{'id': 0}]))
        data_only = Batch(frames=pd.DataFrame([{'data': 1}]))

        merged = Batch.merge_column_wise([id_only, data_only])

        both_columns = Batch(frames=pd.DataFrame([{'id': 0, 'data': 1}]))
        self.assertEqual(merged, both_columns)
Exemplo n.º 4
0
    def test_should_return_smaller_num_rows(self):
        # Four random 100x4 frames are pushed through a SampleExecutor with
        # a constant value of 3; the output must match picking every 3rd
        # row (by index) from the concatenated input.
        step = 3

        frames = []
        for _ in range(4):
            values = np.random.randint(0, 100, size=(100, 4))
            frames.append(pd.DataFrame(values, columns=list('ABCD')))
        batches = [Batch(frames=frame) for frame in frames]

        executor = SampleExecutor(SamplePlan(ConstantValueExpression(step)))
        executor.append_child(DummyExecutor(batches))
        sampled_batches = list(executor.exec())

        # Reference result computed directly on the concatenated input.
        combined = Batch.concat(batches)
        kept_indices = range(0, len(combined), step)
        expected = Batch.concat(
            [combined._get_frames_from_indices(kept_indices)])

        actual = Batch.concat(sampled_batches)

        self.assertEqual(len(expected), len(actual))
        self.assertEqual(expected, actual)
Exemplo n.º 5
0
 def test_slicing_on_batched_should_return_new_batch_frame(self):
     # A full slice must equal the source batch; slicing off the last
     # element must equal a batch with one frame set and one outcome entry.
     source = Batch(frames=create_dataframe(2),
                    outcomes={'test': [[None], [None]]})
     without_last = Batch(frames=create_dataframe(),
                          outcomes={'test': [[None]]})

     full_slice = source[:]
     self.assertEqual(source, full_slice)

     head_slice = source[:-1]
     self.assertEqual(without_last, head_slice)
Exemplo n.º 6
0
 def test_set_outcomes_method_should_set_temp_outcome_when_bool_is_true(
         self):
     # set_outcomes(..., is_temp=True) should leave the batch equal to one
     # constructed with the same entry passed as temp_outcomes.
     actual = Batch(frames=create_dataframe())
     actual.set_outcomes('test', [1], is_temp=True)

     temp_entries = {'test': [1]}
     expected = Batch(frames=create_dataframe(), temp_outcomes=temp_entries)
     self.assertEqual(expected, actual)
Exemplo n.º 7
0
    def load(self) -> Iterator[Batch]:
        """
        This is a generator for loading the frames of a video.

        Records are read from ``self._load_frames()``. A record is skipped
        when ``self.skip_frames`` is positive and the record's identifier is
        not a multiple of it. Reading stops at the first record whose
        identifier is greater than or equal to ``self.limit`` (when a limit
        is set). Frames still buffered at that point, or at the end of the
        input, are emitted as a final, possibly smaller, batch.

        Yields:
        :obj: `Batch`: An object containing a batch of frames
                                       and record specific metadata
        """

        def to_batch(records):
            # Every emitted batch wraps its records the same way.
            return Batch(pd.DataFrame(records),
                         identifier_column=self.identifier_column)

        frames = []
        for record in self._load_frames():
            if self.skip_frames > 0 and record.get(self.identifier_column,
                                                   0) % self.skip_frames != 0:
                continue
            if self.limit and record.get(self.identifier_column,
                                         0) >= self.limit:
                # Limit reached: stop reading; whatever is buffered is
                # flushed after the loop.
                break
            frames.append(record)
            if len(frames) % self.batch_size == 0:
                yield to_batch(frames)
                frames = []
        # `return <value>` in a generator is invisible to a consumer that
        # iterates it, so the trailing partial batch has to be yielded.
        if frames:
            yield to_batch(frames)
Exemplo n.º 8
0
    def test_should_return_the_new_path_after_execution(self, mock_class):
        # The mocked class's load() hands back two three-row batches and the
        # stub predicate evaluates to [True, False, True]; the expected plan
        # output is every other row of each batch, added together.
        loader = mock_class.return_value

        # Stand-in predicate class whose evaluate() ignores its argument.
        predicate = type('dummy_expr', (),
                         {"evaluate": lambda x=None: [True, False, True]})

        # Build plan tree
        video_metadata = DataFrameMetadata("dataset", "dummy.avi")
        first = Batch(pd.DataFrame({'data': [1, 2, 3]}))
        second = Batch(pd.DataFrame({'data': [4, 5, 6]}))
        loader.load.return_value = map(lambda x: x, [first, second])

        storage = StoragePlan(video_metadata)
        scan = SeqScanPlan(predicate=predicate, column_ids=[])
        scan.append_child(storage)

        # Execute the plan
        result = PlanExecutor(scan).execute_plan()
        every_other_row = first[::2] + second[::2]

        mock_class.assert_called_once()

        self.assertEqual(every_other_row, result)
Exemplo n.º 9
0
 def test_when_function_executor_with_a_child_should_allow_chaining(self):
     # The parent wraps its input in a DataFrame and has a child that adds
     # 1; evaluating the parent on [1, 2, 3] is expected to give a Batch of
     # [2, 3, 4].
     parent = FunctionExpression(lambda x: pd.DataFrame(x))
     add_one = FunctionExpression(lambda x: x + 1)
     parent.append_child(add_one)

     input_batch = Batch(pd.DataFrame([1, 2, 3]))
     result = parent.evaluate(input_batch)

     incremented = Batch(pd.DataFrame([2, 3, 4]))
     self.assertEqual(incremented, result)
Exemplo n.º 10
0
 def test_fetching_frames_by_index_should_also_return_temp_outcomes(self):
     # Indexing with [0] should carry over the first entry of both the
     # regular outcomes and the temporary outcomes.
     source = Batch(frames=create_dataframe_same(2),
                    outcomes={'test': [[1], [2]]},
                    temp_outcomes={'test2': [[3], [4]]})
     first_only = Batch(frames=create_dataframe(),
                        outcomes={'test': [[1]]},
                        temp_outcomes={'test2': [[3]]})

     selected = source[[0]]
     self.assertEqual(first_only, selected)
Exemplo n.º 11
0
    def test_should_return_sorted_frames(self):
        """
        ORDER BY A ASC, B DESC over three batches of sizes 1, 2 and 3.

        input (3 batches, columns 'A' 'B' 'C'):
        [1, 1, 1]
        ----------
        [1, 5, 6]
        [4, 7, 10]
        ----------
        [2, 9, 7]
        [4, 1, 2]
        [4, 2, 4]

        expected rows, split again into batches of sizes 1, 2 and 3:
           A  B   C
        0  1  5   6
        1  1  1   1
        2  2  9   7
        3  4  7  10
        4  4  2   4
        5  4  1   2
        """
        column_names = ['A', 'B', 'C']

        def as_batch(rows):
            # Wrap a list of rows in a Batch with the shared column names.
            frame = pd.DataFrame(np.array(rows), columns=column_names)
            return Batch(frames=frame)

        input_batches = [
            as_batch([[1, 1, 1]]),
            as_batch([[1, 5, 6], [4, 7, 10]]),
            as_batch([[2, 9, 7], [4, 1, 2], [4, 2, 4]]),
        ]

        # query: .... ORDER BY A ASC, B DESC
        sort_keys = [
            (TupleValueExpression('A'), ParserOrderBySortType.ASC),
            (TupleValueExpression('B'), ParserOrderBySortType.DESC),
        ]
        executor = OrderByExecutor(OrderByPlan(sort_keys))
        executor.append_child(DummyExecutor(input_batches))
        sorted_batches = list(executor.exec())

        expected_batches = [
            as_batch([[1, 5, 6]]),
            as_batch([[1, 1, 1], [2, 9, 7]]),
            as_batch([[4, 7, 10], [4, 2, 4], [4, 1, 2]]),
        ]

        # Compare position by position; a missing output batch fails loudly.
        for position, expected in enumerate(expected_batches):
            self.assertEqual(expected, sorted_batches[position])
    def test_should_return_top_frames_after_sorting(self):
        """
        Checks that a limit of 2 applied after ORDER BY A ASC, B DESC
        returns the top 2 rows of the sorted data.

        data (3 batches, columns 'A' 'B' 'C'):
        [1, 1, 1]
        ----------
        [1, 5, 6]
        [4, 7, 10]
        ----------
        [2, 9, 7]
        [4, 1, 2]
        [4, 2, 4]

        expected result:
           A  B   C
        0  1  5   6
        1  1  1   1
        """
        column_names = ['A', 'B', 'C']

        def as_batch(rows):
            # Wrap a list of rows in a Batch with the shared column names.
            frame = pd.DataFrame(np.array(rows), columns=column_names)
            return Batch(frames=frame)

        input_batches = [
            as_batch([[1, 1, 1]]),
            as_batch([[1, 5, 6], [4, 7, 10]]),
            as_batch([[2, 9, 7], [4, 1, 2], [4, 2, 4]]),
        ]

        # query: .... ORDER BY A ASC, B DESC limit 2
        sort_keys = [
            (TupleValueExpression('A'), ParserOrderBySortType.ASC),
            (TupleValueExpression('B'), ParserOrderBySortType.DESC),
        ]
        order_executor = OrderByExecutor(OrderByPlan(sort_keys))
        order_executor.append_child(DummyExecutor(input_batches))
        sorted_batches = list(order_executor.exec())

        # Feed the sorted batches into the limit stage.
        row_limit = 2
        limit_executor = LimitExecutor(
            LimitPlan(ConstantValueExpression(row_limit)))
        limit_executor.append_child(DummyExecutor(sorted_batches))
        limited_batches = list(limit_executor.exec())

        # merge everything into one batch
        merged = Batch.concat(limited_batches, copy=False)

        top_two = as_batch([[1, 5, 6], [1, 1, 1]])
        self.assertEqual(top_two, merged)
Exemplo n.º 13
0
 def test_should_update_the_batch_with_outcomes_in_exec_mode(self):
     # Evaluating an EXEC-mode expression named "test" should leave the
     # input batch equal to one built with outcomes {"test": [1, 2, 3]}.
     returned_values = [1, 2, 3]
     expression = FunctionExpression(lambda x: returned_values,
                                     mode=ExecutionMode.EXEC,
                                     name="test")

     with_outcomes = Batch(frames=pd.DataFrame(),
                           outcomes={"test": [1, 2, 3]})
     batch_under_test = Batch(frames=pd.DataFrame())

     expression.evaluate(batch_under_test)
     self.assertEqual(with_outcomes, batch_under_test)
Exemplo n.º 14
0
    def test_should_load_video_in_table(self):
        # Load the dummy video, read it back through the storage engine as a
        # single concatenated and sorted batch, and compare it with the
        # batches produced by create_dummy_batches().
        query = """LOAD DATA INFILE 'dummy.avi' INTO MyVideo;"""
        execute_query_fetch_all(query)

        metadata = CatalogManager().get_dataset_metadata("", "MyVideo")
        actual_batch = Batch.concat(StorageEngine.read(metadata), copy=False)
        actual_batch.sort()

        expected_batches = list(create_dummy_batches())
        self.assertEqual([actual_batch], expected_batches)
Exemplo n.º 15
0
 def test_should_update_temp_outcomes_when_is_temp_set_exec_mode(self):
     # Evaluating an EXEC-mode expression created with is_temp=True should
     # leave the input batch equal to one built with temp_outcomes
     # {"test": [1, 2, 3]}.
     returned_values = [1, 2, 3]
     expression = FunctionExpression(lambda x: returned_values,
                                     mode=ExecutionMode.EXEC,
                                     name="test",
                                     is_temp=True)

     with_temp_outcomes = Batch(frames=pd.DataFrame(),
                                temp_outcomes={"test": [1, 2, 3]})
     batch_under_test = Batch(frames=pd.DataFrame())

     expression.evaluate(batch_under_test)
     self.assertEqual(with_temp_outcomes, batch_under_test)
Exemplo n.º 16
0
    def test_should_load_video_in_table(self):
        # Load the dummy video, accumulate every batch read back from the
        # storage engine into one, sort it, and compare it with the batches
        # produced by create_dummy_batches().
        perform_query("""LOAD DATA INFILE 'dummy.avi' INTO MyVideo;""")

        table_metadata = CatalogManager().get_dataset_metadata("", "MyVideo")

        # Start from an empty batch and add each stored batch in turn.
        accumulated = Batch(pd.DataFrame())
        for stored_batch in StorageEngine.read(table_metadata):
            accumulated += stored_batch
        accumulated.sort()

        self.assertEqual([accumulated], list(create_dummy_batches()))
Exemplo n.º 17
0
 def evaluate(self, *args, **kwargs):
     """
     Evaluate the first child and aggregate its frames.

     The aggregation applied ('sum', 'count', 'mean', 'min' or 'max') is
     chosen by ``self.etype``.

     Returns:
         A Batch wrapping the aggregated frames, or None when
         ``self.etype`` is not one of the aggregation types listed below.
     """
     child_batch = self.get_child(0).evaluate(*args, **kwargs)

     # Checked in order; the first matching expression type wins.
     aggregations = (
         (ExpressionType.AGGREGATION_SUM, 'sum'),
         (ExpressionType.AGGREGATION_COUNT, 'count'),
         (ExpressionType.AGGREGATION_AVG, 'mean'),
         (ExpressionType.AGGREGATION_MIN, 'min'),
         (ExpressionType.AGGREGATION_MAX, 'max'),
     )
     for expression_type, agg_name in aggregations:
         if self.etype == expression_type:
             return Batch(frames=child_batch.frames.agg([agg_name]))
     return None
Exemplo n.º 18
0
    def evaluate(self, *args, **kwargs):
        """
        Evaluate both children and combine their frames arithmetically.

        ``self.etype`` selects addition, subtraction, multiplication or
        division of the left frames by/with the right frames.

        Returns:
            A Batch wrapping the combined frames, or None when
            ``self.etype`` is not one of the four arithmetic types.
        """
        lhs = self.get_child(0).evaluate(*args, **kwargs).frames
        rhs = self.get_child(1).evaluate(*args, **kwargs).frames

        # Only the operation matching the expression type is computed.
        if self.etype == ExpressionType.ARITHMETIC_ADD:
            combined = lhs + rhs
        elif self.etype == ExpressionType.ARITHMETIC_SUBTRACT:
            combined = lhs - rhs
        elif self.etype == ExpressionType.ARITHMETIC_MULTIPLY:
            combined = lhs * rhs
        elif self.etype == ExpressionType.ARITHMETIC_DIVIDE:
            combined = lhs / rhs
        else:
            return None
        return Batch(pd.DataFrame(combined))
Exemplo n.º 19
0
    def test_adding_batch_frame_with_outcomes_returns_new_batch_frame(self):
        # Adding two batches should combine, per key, their outcomes and
        # temp_outcomes: [1] + [2] -> [1, 2] for both '1' and '2'.
        first = Batch(frames=create_dataframe(),
                      outcomes={'1': [1]},
                      temp_outcomes={'2': [1]})
        second = Batch(frames=create_dataframe(),
                       outcomes={'1': [2]},
                       temp_outcomes={'2': [2]})
        combined = Batch(frames=create_dataframe_same(2),
                         outcomes={'1': [1, 2]},
                         temp_outcomes={'2': [1, 2]})

        actual_sum = first + second
        self.assertEqual(combined, actual_sum)
Exemplo n.º 20
0
    def evaluate(self, batch: Batch):
        """
        Apply ``self.function`` to the batch or to the first child's result.

        When the expression has at least one child, the function receives
        the result of evaluating the first child on ``batch``; otherwise it
        receives ``batch`` itself. In EXEC mode the result is also stored on
        ``batch`` under ``self.name`` via ``set_outcomes``.

        Returns:
            Whatever ``self.function`` returned.
        """
        if self.get_children_count() > 0:
            function_input = self.get_child(0).evaluate(batch)
        else:
            function_input = batch

        outcome = self.function(function_input)

        if self.mode == ExecutionMode.EXEC:
            batch.set_outcomes(self.name, outcome, is_temp=self.is_temp)
        return outcome
Exemplo n.º 21
0
    def evaluate(self, *args):
        """
        Evaluate a logical AND / OR (two children) or NOT (otherwise).

        Returns:
            A Batch wrapping the element-wise result, or None when
            ``self.etype`` does not match the operation expected for the
            number of children.
        """
        if self.get_children_count() != 2:
            # Unary case: only NOT is handled.
            operand = self.get_child(0).evaluate(*args).frames
            if self.etype == ExpressionType.LOGICAL_NOT:
                return Batch(pd.DataFrame(~operand))
            return None

        # Binary case: both sides are evaluated before the type is checked.
        lhs = self.get_child(0).evaluate(*args).frames
        rhs = self.get_child(1).evaluate(*args).frames
        if self.etype == ExpressionType.LOGICAL_AND:
            return Batch(pd.DataFrame(lhs & rhs))
        if self.etype == ExpressionType.LOGICAL_OR:
            return Batch(pd.DataFrame(lhs | rhs))
        return None
Exemplo n.º 22
0
    def evaluate(self, batch: Batch):
        """
        Run the function on the batch, or on its children's merged results.

        Every child is evaluated on ``batch``; if there are any children,
        their results are merged column-wise and used as the input instead
        of ``batch``. The callable obtained from
        ``self._gpu_enabled_function()`` is applied to the input's frames
        and its result is wrapped in a new Batch.

        Returns:
            The result Batch, projected onto ``self._output`` when that is
            set.
        """
        child_results = []
        for child in self.children:
            child_results.append(child.evaluate(batch))

        function_input = batch
        if child_results:
            function_input = Batch.merge_column_wise(child_results)

        run = self._gpu_enabled_function()
        result = Batch(pd.DataFrame(run(function_input.frames)))

        if not self._output:
            return result
        return result.project([self._output])
Exemplo n.º 23
0
    def test_has_outcomes_returns_true_if_the_given_name_is_in_outcomes(self):
        # has_outcome should be true for a regular outcome and for one set
        # with is_temp=True.
        batch = Batch(frames=create_dataframe())
        batch.set_outcomes('test_temp', [1], is_temp=True)
        batch.set_outcomes('test', [1])

        for outcome_name in ('test', 'test_temp'):
            self.assertTrue(batch.has_outcome(outcome_name))
    def test_should_return_only_frames_satisfy_predicate(self):
        # The stub predicate is True only for the third row, so the first
        # batch out of the scan should be that row with its index reset.
        source_frame = create_dataframe(3)
        source = Batch(frames=source_frame)

        def third_row_only(x):
            # Ignores its argument; always the same three-row mask.
            return Batch(pd.DataFrame([False, False, True]))

        predicate = type("AbstractExpression", (),
                         {"evaluate": third_row_only})
        scan_plan = type("ScanPlan", (), {"predicate": predicate,
                                          "columns": None})

        executor = SequentialScanExecutor(scan_plan)
        executor.append_child(DummyExecutor([source]))

        third_row = Batch(source[[2]].frames.reset_index(drop=True))
        first_output = list(executor.exec())[0]
        self.assertEqual(third_row, first_output)
Exemplo n.º 25
0
    def test_execute_plan_for_pp_scan_plan(self, mock_clean, mock_build):
        # PPExecutor
        # The mocked executor tree produces three one-row batches;
        # execute_plan() is expected to return them as one three-row batch.
        tree = MagicMock(node=PPScanPlan(None))
        tree.exec.return_value = [
            Batch(pd.DataFrame([value])) for value in (1, 2, 3)
        ]
        mock_build.return_value = tree

        result = PlanExecutor(None).execute_plan()

        # Each patched hook and the tree's exec() run exactly once.
        mock_build.assert_called_once_with(None)
        mock_clean.assert_called_once()
        tree.exec.assert_called_once()

        single_batch = Batch(pd.DataFrame([[1], [2], [3]]))
        self.assertEqual(result, single_batch)
Exemplo n.º 26
0
    def exec(self) -> Iterator[Batch]:
        """
        Sort everything the first child produces and re-emit it in pieces.

        All batches from the first child are concatenated into one batch,
        which is sorted with ``self.extract_column_names()`` and
        ``self.extract_sort_types()``. The result is then sliced into
        consecutive pieces whose lengths come from ``self.batch_sizes``.

        Yields:
            Batch: the next slice, after ``reset_index()`` was called on it.
        """
        source = self.children[0]
        collected = []

        # aggregates the batches into one large batch
        # NOTE(review): self.batch_sizes is appended to here but never
        # cleared in this method -- confirm it starts empty for every run.
        for incoming in source.exec():
            self.batch_sizes.append(incoming.batch_size)
            collected.append(incoming)
        merged = Batch.concat(collected, copy=False)

        # sorts the batch
        try:
            merged.sort_orderby(by=self.extract_column_names(),
                                sort_type=self.extract_sort_types())
        except KeyError:
            # pass for now
            pass

        # split the aggregated batch into smaller ones based
        #  on self.batch_sizes which holds the input batches sizes
        start = 0
        for size in self.batch_sizes:
            stop = start + size
            piece = merged[start:stop]
            piece.reset_index()
            start = stop
            yield piece
    def test_should_return_limit_greater_than_size(self):
        """ A limit larger than the amount of data present should
        return all of it: the total row count must be unchanged.
        This will also leave a warning """

        frames = []
        for _ in range(4):
            values = np.random.randint(0, 100, size=(100, 4))
            frames.append(pd.DataFrame(values, columns=list('ABCD')))
        batches = [Batch(frames=frame) for frame in frames]

        rows_before = sum(batch.batch_size for batch in batches)

        # 4 batches x 100 rows = 400 rows, below the limit of 500.
        oversized_limit = 500
        executor = LimitExecutor(
            LimitPlan(ConstantValueExpression(oversized_limit)))
        executor.append_child(DummyExecutor(batches))
        limited_batches = list(executor.exec())

        rows_after = sum(batch.batch_size for batch in limited_batches)

        self.assertEqual(rows_before, rows_after)
Exemplo n.º 28
0
    def write(self, table: DataFrameMetadata, rows: Batch):
        """
        Append the rows of a batch to the table's parquet data.

        Arguments:
            table: table metadata object to write into
            rows : batch to be persisted in the storage; nothing is
                   written when it is empty.
        """

        if rows.empty():
            return
        # ToDo
        # Throw an error if the row schema doesn't match the table schema

        with materialize_dataset(self.spark_session, self._spark_url(table),
                                 table.schema.petastorm_schema):

            frame = rows.frames
            column_names = frame.keys()

            # Pair each row of values with the column names, then convert
            # the resulting dict using the table's petastorm schema.
            value_rows = self.spark_context.parallelize(frame.values)
            keyed_rows = value_rows.map(
                lambda x: dict(zip(column_names, x)))
            spark_rows = keyed_rows.map(
                lambda x: dict_to_spark_row(table.schema.petastorm_schema,
                                            x))

            spark_frame = self.spark_session.createDataFrame(
                spark_rows, table.schema.pyspark_schema)
            appender = spark_frame.coalesce(1).write.mode('append')
            appender.parquet(self._spark_url(table))
Exemplo n.º 29
0
    def test_select_and_where_video_in_table(self):
        # WHERE on id is checked four ways: equality with both columns,
        # equality with data only, an open-ended range, a bounded range.
        def first_dummy_batch(ids):
            # First batch of the dummy batches restricted to the given ids.
            return list(create_dummy_batches(filters=ids))[0]

        result = execute_query_fetch_all(
            "SELECT id,data FROM MyVideo WHERE id = 5;")
        self.assertEqual(result, first_dummy_batch([5]))

        result = execute_query_fetch_all(
            "SELECT data FROM MyVideo WHERE id = 5;")
        frame_five = np.array(np.ones((2, 2, 3)) * float(5 + 1) * 25,
                              dtype=np.uint8)
        data_only = Batch(frames=pd.DataFrame([{"data": frame_five}]))
        self.assertEqual(result, data_only)

        # Range results are sorted before comparing.
        result = execute_query_fetch_all(
            "SELECT id, data FROM MyVideo WHERE id >= 2;")
        result.sort()
        self.assertEqual(result, first_dummy_batch(range(2, NUM_FRAMES)))

        result = execute_query_fetch_all(
            "SELECT id, data FROM MyVideo WHERE id >= 2 AND id < 5;")
        result.sort()
        self.assertEqual(result, first_dummy_batch(range(2, 5)))
Exemplo n.º 30
0
    def test_should_load_and_sort_in_table(self):
        # ORDER BY id should give the frames in ascending id order; with
        # DESC the same expected batch, reversed in place, should match.
        ascending_query = "SELECT data, id FROM MyVideo ORDER BY id;"
        ascending = execute_query_fetch_all(ascending_query)

        rows = []
        for frame_id in range(NUM_FRAMES):
            pixels = np.ones((2, 2, 3)) * float(frame_id + 1) * 25
            rows.append({
                'id': frame_id,
                'data': np.array(pixels, dtype=np.uint8),
            })
        expected = Batch(frames=pd.DataFrame(rows))
        self.assertEqual(ascending, expected)

        descending_query = "SELECT data, id FROM MyVideo ORDER BY id DESC;"
        descending = execute_query_fetch_all(descending_query)
        expected.reverse()
        self.assertEqual(descending, expected)